-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathPDF_Analyze_Bin_Extractor.py
More file actions
102 lines (80 loc) · 2.87 KB
/
PDF_Analyze_Bin_Extractor.py
File metadata and controls
102 lines (80 loc) · 2.87 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import hashlib
import os
import re
from collections import Counter
# A PDF name token ends at whitespace or a delimiter character, so "/JS" only
# counts as the JavaScript key when followed by one of those (or end of data).
# This covers "/JS (code)", "/JS <hex>" and "/JS 5 0 R" alike.
_JS_KEY_PATTERN = re.compile(rb'/JS(?=[\x00\s()<>\[\]{}/%]|\Z)')
_OBJ_PATTERN = re.compile(rb'\d+\s+0\s+obj')
_STREAM_PATTERN = re.compile(rb'stream\s*[\s\S]*?\s*endstream')
_URI_PATTERN = re.compile(rb'URI\s*\(\s*(\S+)\s*\)')
_COMMENT_PATTERN = re.compile(rb'%[^\n]*')
# Bytes regarded as plain text: 0x09-0x0d and 0x20-0x7e. Everything else
# (0x00-0x08, 0x0e-0x1f, 0x7f-0xff) is counted as binary.
_TEXT_BYTES = bytes(range(0x09, 0x0E)) + bytes(range(0x20, 0x7F))


def _count_matches(pattern, data):
    """Count non-overlapping matches of *pattern* in *data* without storing them."""
    return sum(1 for _ in pattern.finditer(data))


def analyze_pdf(file_path):
    """Scan the raw bytes of a PDF file and return a plain-text report.

    The file is read as bytes and inspected with regular expressions; it is
    not parsed as a PDF document, so all counts are heuristic.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        A multi-line string with the file name, MD5 checksum, size in bytes,
        the number of ``/JS`` keys, objects (generation 0 only), streams,
        URIs and ``%`` comments, followed by an ``Errors:`` section listing
        the findings (or ``None`` when there are none).

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(file_path, 'rb') as file:
        pdf_content = file.read()

    # MD5 is used here only as a file identifier, not for security.
    checksum = hashlib.md5(pdf_content).hexdigest()
    # Report the size of exactly the bytes that were hashed and scanned.
    size = len(pdf_content)

    js_count = _count_matches(_JS_KEY_PATTERN, pdf_content)
    # Deleting every text byte leaves only the binary ones; this avoids
    # building a list with one element per binary byte.
    binary_count = len(pdf_content.translate(None, _TEXT_BYTES))
    object_count = _count_matches(_OBJ_PATTERN, pdf_content)
    stream_count = _count_matches(_STREAM_PATTERN, pdf_content)
    uri_count = _count_matches(_URI_PATTERN, pdf_content)
    comment_count = _count_matches(_COMMENT_PATTERN, pdf_content)

    # Findings
    errors = []
    if js_count > 0:
        errors.append(f"Embedded JavaScript found ({js_count} instances)")
    if binary_count > 0:
        errors.append(f"Binary data found ({binary_count} instances)")
    if object_count > 1000:
        errors.append(f"Excessive number of objects ({object_count})")
    if stream_count > 1000:
        errors.append(f"Excessive number of streams ({stream_count})")
    if uri_count > 0:
        errors.append(f"URI found ({uri_count} instances)")

    # Report (no trailing newline after the last line)
    report_lines = [
        f"File: {os.path.basename(file_path)}",
        f"Checksum: {checksum}",
        f"Size: {size} bytes",
        f"Embedded JavaScript: {js_count}",
        f"Objects: {object_count}",
        f"Streams: {stream_count}",
        f"URI: {uri_count}",
        f"Comments: {comment_count}",
        "Errors:",
    ]
    findings = "\n".join(errors) if errors else "None"
    return "\n".join(report_lines) + "\n" + findings
# Example usage: analyze the sample PDF, show the report, and save it to disk.
pdf_file = '/home/kali/Downloads/mal.pdf'
output_text = analyze_pdf(pdf_file)
# Show the report itself; an f-string without a placeholder would only emit
# the literal word "output".
print(output_text)
output_file = 'pdf_analysis_output.txt'
with open(output_file, 'w', encoding='utf-8') as file:
    file.write(output_text)
print(f"Analysis saved to: {output_file}")
def extract_binary_data(pdf_file):
    """Return the contents of a file with four whitespace bytes removed.

    The bytes dropped are line feed (0x0A), carriage return (0x0D), space
    (0x20) and horizontal tab (0x09). Every other byte is kept, in its
    original order.

    Args:
        pdf_file: Path of the file to read.

    Returns:
        A ``bytes`` object holding the remaining data.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(pdf_file, 'rb') as file:
        pdf_content = file.read()
    # A single translate pass deletes the unwanted bytes in linear time;
    # appending to an immutable bytes object per byte copies it every time.
    return pdf_content.translate(None, b'\n\r \t')
# Example usage: strip the whitespace bytes from the sample PDF and write
# whatever remains to a .bin file in the current directory.
pdf_file = '/home/kali/Downloads/mal.pdf'
output_file = 'extracted_binary_data.bin'
binary_data = extract_binary_data(pdf_file)
with open(output_file, mode='wb') as file:
    file.write(binary_data)
print(f"Binary data extracted and saved to: {output_file}")