# MalwareAnalysis_Scripts/PDF_Analyze_Bin_Extractor.py at main · malwaredev/MalwareAnalysis_Scripts · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import hashlib
import os
import re
from collections import Counter

def _count_matches(pattern, data):
    """Return the number of non-overlapping matches of *pattern* in *data*."""
    # finditer avoids materialising a list of every match, which matters for
    # the single-byte "binary" pattern on large files.
    return sum(1 for _ in re.finditer(pattern, data))


def analyze_pdf(file_path):
    """Scan a PDF's raw bytes for simple static indicators and build a report.

    The file is read once in binary mode and searched with byte regexes; it
    is not parsed as a PDF, so counts are heuristic (for example, a percent
    sign inside a compressed stream is counted as a comment).

    Args:
        file_path: Path of the file to analyse.

    Returns:
        A multi-line string (no trailing newline) with the file name, MD5
        checksum, size in bytes, the JavaScript/object/stream/URI/comment
        counts, and an "Errors:" section listing every triggered indicator
        or "None".

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(file_path, 'rb') as file:
        pdf_content = file.read()

    # Embedded JavaScript: count /JS name keys. The script itself follows the
    # key as a string, a stream or an indirect reference ("/JS 12 0 R"), so
    # requiring the "obj" keyword after it would never match a real document.
    js_count = _count_matches(rb'/JS\b', pdf_content)

    # MD5 is used here only as a file identifier for the report.
    checksum = hashlib.md5(pdf_content).hexdigest()

    # Size of exactly the bytes that were analysed.
    size = len(pdf_content)

    # Bytes outside printable ASCII and the common whitespace controls.
    binary_count = _count_matches(rb'[\x00-\x08\x0e-\x1f\x7f-\xff]', pdf_content)

    # Indirect object headers with generation number 0, e.g. "12 0 obj".
    object_count = _count_matches(rb'\d+\s+0\s+obj', pdf_content)

    # stream ... endstream pairs (non-greedy, so each pair is one match).
    stream_count = _count_matches(rb'stream\s*[\s\S]*?\s*endstream', pdf_content)

    # URI entries whose value is a literal string without whitespace.
    uri_count = _count_matches(rb'URI\s*\(\s*(\S+)\s*\)', pdf_content)

    # Every "%" up to the end of its line.
    comment_count = _count_matches(rb'%[^\n]*', pdf_content)

    # Indicators worth flagging to the analyst.
    errors = []
    if js_count > 0:
        errors.append(f"Embedded JavaScript found ({js_count} instances)")
    if binary_count > 0:
        errors.append(f"Binary data found ({binary_count} instances)")
    if object_count > 1000:
        errors.append(f"Excessive number of objects ({object_count})")
    if stream_count > 1000:
        errors.append(f"Excessive number of streams ({stream_count})")
    if uri_count > 0:
        errors.append(f"URI found ({uri_count} instances)")

    report_lines = [
        f"File: {os.path.basename(file_path)}",
        f"Checksum: {checksum}",
        f"Size: {size} bytes",
        f"Embedded JavaScript: {js_count}",
        f"Objects: {object_count}",
        f"Streams: {stream_count}",
        f"URI: {uri_count}",
        f"Comments: {comment_count}",
        "Errors:",
    ]
    # One line per finding, or the literal "None" when nothing was flagged.
    report_lines.extend(errors if errors else ["None"])

    return "\n".join(report_lines)

# Example usage: runs only when this file is executed as a script, so that
# importing the module to reuse its functions does not try to read the
# hard-coded sample path or write a report into the working directory.
if __name__ == "__main__":
    pdf_file = '/home/kali/Downloads/mal.pdf'
    output_text = analyze_pdf(pdf_file)

    # The report is plain text; write it next to where the script is run.
    output_file = 'pdf_analysis_output.txt'
    with open(output_file, 'w', encoding='utf-8') as file:
        file.write(output_text)

    print(f"Analysis saved to: {output_file}")

def extract_binary_data(pdf_file):
    """Return the file's content with four ASCII whitespace bytes removed.

    The bytes dropped are line feed (0x0A), carriage return (0x0D), space
    (0x20) and horizontal tab (0x09). Every other byte, including other
    control characters, is kept in its original order.

    Args:
        pdf_file: Path of the file to read.

    Returns:
        A bytes object holding the filtered content (empty for an empty
        file).

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(pdf_file, 'rb') as file:
        pdf_content = file.read()

    # With table=None, bytes.translate only deletes the listed byte values.
    # This is a single pass, unlike appending one byte at a time to an
    # immutable bytes object, which re-copies the result on every step.
    return pdf_content.translate(None, b'\n\r \t')

# Example usage: runs only when this file is executed as a script, so that
# importing the module does not read the hard-coded sample path or write an
# output file as a side effect.
if __name__ == "__main__":
    pdf_file = '/home/kali/Downloads/mal.pdf'
    binary_data = extract_binary_data(pdf_file)

    # Binary mode: the extracted data is raw bytes, not text.
    output_file = 'extracted_binary_data.bin'
    with open(output_file, 'wb') as file:
        file.write(binary_data)

    print(f"Binary data extracted and saved to: {output_file}")