Malware_Classification/utils.py at main · rickyxume/Malware_Classification · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
from nlpaug.util import Method
from nlpaug.augmenter.char import CharAugmenter
import random
import numpy as np
# Seed NumPy's global RNG at import time so np.random draws (e.g. the
# np.random.choice call in replaceModel.predict) are repeatable across runs.
# NOTE: this does not seed Python's built-in `random` module, which
# CustomCharAug.substitute also draws from.
np.random.seed(42)

# edited from: https://github.com/makcedward/nlpaug/blob/master/example/custom_augmenter.ipynb

""" 随机替换同义opcode """


class CustomCharAug(CharAugmenter):
    """Augmenter that swaps selected opcode tokens for a synonymous opcode.

    The text is tokenized, a subset of token positions is chosen through the
    inherited ``_get_aug_idxes`` helper, and every chosen token is passed to
    ``replaceModel.predict`` to obtain its replacement. Tokens at low
    positions are protected from replacement (see ``substitute``).
    """

    def __init__(self, name='CustChar_Aug', min_char=2, aug_char_min=1, aug_char_max=10, aug_char_p=0.3,
                 aug_word_min=1, aug_word_max=10, aug_word_p=0.3, tokenizer=None, reverse_tokenizer=None,
                 stopwords=None, verbose=0, stopwords_regex=None):
        """Configure the augmenter.

        All arguments are forwarded unchanged to ``CharAugmenter.__init__``;
        the action is fixed to ``"substitute"`` and the device to ``'cpu'``.
        """
        super().__init__(
            name=name, action="substitute", min_char=min_char, aug_char_min=aug_char_min,
            aug_char_max=aug_char_max, aug_char_p=aug_char_p, aug_word_min=aug_word_min,
            aug_word_max=aug_word_max, aug_word_p=aug_word_p, tokenizer=tokenizer,
            reverse_tokenizer=reverse_tokenizer, stopwords=stopwords, device='cpu',
            verbose=verbose, stopwords_regex=stopwords_regex)
        # Lookup model that maps an opcode to one of its synonyms.
        self.model = replaceModel()

    def substitute(self, data):
        """Return ``data`` with some opcode tokens replaced by synonyms.

        Args:
            data: Input text; it is split with ``self.tokenizer``.

        Returns:
            The result of ``self.reverse_tokenizer`` applied to the token
            list after replacement.
        """
        tokens = self.tokenizer(data)

        # The leading tokens (the first 8-1 low-frequency words, per the
        # original author's note) must not be replaced: only sampled
        # positions strictly greater than `thr` are kept.
        thr = 8

        sampled = self._get_aug_idxes(
            tokens, self.aug_word_min, self.aug_word_max, self.aug_word_p, Method.WORD)
        if sampled is None:
            # Defensive: treat "nothing selected" as an empty selection
            # instead of failing while filtering the positions.
            sampled = []
        target_idxes = {int(idx) for idx in sampled if idx > thr}

        # If nothing survived the filter, pick one position at random so that
        # a long enough input still gets a chance of being augmented.
        # randrange excludes the upper bound, so the pick is always a valid
        # index (random.randint(8, len(tokens)) could return len(tokens),
        # which matches no token and silently skipped the replacement).
        if not target_idxes and len(tokens) > thr:
            target_idxes = {random.randrange(thr, len(tokens))}

        results = []
        for token_i, token in enumerate(tokens):
            if token_i not in target_idxes:
                # Not selected for replacement: keep the token unchanged.
                results.append(token)
                continue
            # predict returns a one-element list; sample draws that element.
            pred = self.sample(self.model.predict(token), 1)[0]
            results.append(pred if pred else '')
        return self.reverse_tokenizer(results)


class replaceModel:
    """Lookup table that maps an opcode to a randomly chosen synonym."""

    def __init__(self):
        """Build ``opcode_synonyms_dict``: opcode -> list of candidate opcodes."""
        # Each entry is (opcodes that share the candidates, the candidates).
        # Every opcode gets its own copy of the candidate list.
        synonym_groups = (
            (('je', 'jz'), ('je', 'jz')),
            (('ins', 'in'), ('in', 'ins')),
            (('outs', 'out'), ('out', 'outs')),
            (('fistp', 'fstp'), ('fistp', 'fstp')),
            # 'jmp':['jg','jl','jb','jnb','jno','jo'],
            (('faddp', 'add', 'daa'), ('faddp', 'add', 'daa')),
            (('sub', 'sbb'), ('sub', 'sbb')),
            (('imul', 'mul'), ('imul', 'mul')),
            (('fdivr', 'fdiv'), ('fdiv', 'fdivr')),
            (('shr', 'sar'), ('shr', 'sar')),
            (('sal', 'shl'), ('sal', 'shl')),
        )
        self.opcode_synonyms_dict = {
            opcode: list(candidates)
            for opcodes, candidates in synonym_groups
            for opcode in opcodes
        }

    def predict(self, x):
        """Return a one-element list holding a replacement for ``x``.

        An opcode found in the table yields one of its candidates, drawn with
        ``np.random.choice`` (the opcode itself is among the candidates).
        Any other value is returned unchanged.
        """
        if x not in self.opcode_synonyms_dict:
            # Unknown opcode: nothing to replace.
            return [x]
        # Original author's note: this random pick appears to duplicate the
        # sampling done by the caller; it was left in place.
        candidates = np.array(self.opcode_synonyms_dict[x])
        return [np.random.choice(candidates)]

# text = "7937 cdq cld fdiv fword inc jb je jo mul not out outs rcr rol ror sbb scas stc sti stos test xchg daa daa dec dec endp endp jg jg jl jl jnb jnb pop pop ret ret sar sar shl shl xor xor add add add call call call cmp cmp cmp in in in ins ins ins jmp jmp jmp jz jz jz lea lea lea push push push std std std dd dd dd dd dw dw dw dw mov mov mov mov or or or or sub sub sub sub"
# aug = CustomCharAug()
# text_aug = aug.augment(text)