|
| 1 | +""" |
| 2 | +A module for spotting suspicious commands using the embeddings |
| 3 | +from our local LLM and a further ANN categoriser. |
| 4 | +
|
| 5 | +The classes in here are not used for inference. The split exists |
| 6 | +because torch is too big to install in the Docker image, so we |
| 7 | +train the model on a local machine and then use the generated |
| 8 | +ONNX file for inference inside the container. |
| 9 | +""" |
| 10 | + |
| 11 | +import os |
| 12 | + |
| 13 | +import torch |
| 14 | +from torch import nn |
| 15 | + |
| 16 | +from codegate.config import Config |
| 17 | +from codegate.inference.inference_engine import LlamaCppInferenceEngine |
| 18 | +from codegate.pipeline.suspicious_commands.suspicious_commands import SuspiciousCommands |
| 19 | + |
| 20 | + |
class SimpleNN(nn.Module):
    """
    A small feed-forward classifier with two hidden layers.

    The stack is Linear -> ReLU -> Dropout -> Linear -> ReLU -> Dropout ->
    Linear. The second hidden layer is half as wide as the first, and the
    final Linear layer emits one raw score per class (no softmax is applied
    inside the network).

    Attributes:
        network (nn.Sequential): The neural network layers.
    """

    def __init__(self, input_dim=1, hidden_dim=128, num_classes=2, dropout=0.2):
        """
        Initialize the SimpleNN model. The default args should be ok,
        but the input_dim must match the incoming training data.

        Args:
            input_dim (int): Dimension of the input features.
            hidden_dim (int): Width of the first hidden layer; the second
                hidden layer uses ``hidden_dim // 2``.
            num_classes (int): Number of output classes.
            dropout (float): Dropout probability used after each hidden
                layer. Defaults to 0.2, the previously hard-coded value.
        """
        super().__init__()
        half_dim = hidden_dim // 2
        self.network = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, half_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(half_dim, num_classes),
        )

    def forward(self, x):
        """
        Forward pass through the network.

        Args:
            x (torch.Tensor): Input whose last dimension is ``input_dim``.

        Returns:
            torch.Tensor: The output of the final Linear layer, with last
            dimension ``num_classes``.
        """
        return self.network(x)
| 55 | + |
| 56 | + |
class SuspiciousCommandsTrainer(SuspiciousCommands):
    """
    Class to train suspicious command detection using a neural network.

    Phrases are embedded with the inference engine, a SimpleNN classifier is
    fitted on those embeddings, and the result can be exported to ONNX with
    save_model().

    Attributes:
        model_path (str): Path to the embedding model, built from the
            configuration; empty when the configuration is incomplete.
        inference_engine (LlamaCppInferenceEngine): Inference engine for
            embedding.
        simple_nn (SimpleNN): Neural network model; None until train() runs.
    """

    # Singleton slot for the trainer itself (kept separate from the parent's).
    _instance = None

    @staticmethod
    def get_instance(model_file=None):
        """
        Get the singleton instance of SuspiciousCommandsTrainer. Initialize
        and load from file on the first call if it has not been done.

        Args:
            model_file (str, optional): The file name to load the model from.
                Defaults to "simple_nn_model.onnx" next to this module.

        Returns:
            SuspiciousCommandsTrainer: The singleton instance.
        """
        if SuspiciousCommandsTrainer._instance is None:
            # Build the trainer, not the base class: a plain SuspiciousCommands
            # would not expose train()/save_model().
            instance = SuspiciousCommandsTrainer()
            if model_file is None:
                current_file_path = os.path.dirname(os.path.abspath(__file__))
                model_file = os.path.join(current_file_path, "simple_nn_model.onnx")
            # load_trained_model is not defined in this class; presumably it is
            # inherited from SuspiciousCommands -- verify against the parent.
            instance.load_trained_model(model_file)
            # Cache only after a successful load so a failed load does not
            # leave a half-initialised singleton behind.
            SuspiciousCommandsTrainer._instance = instance
        return SuspiciousCommandsTrainer._instance

    def __init__(self):
        """
        Initialize the SuspiciousCommandsTrainer class.

        Reads the embedding model location from the configuration and
        creates the inference engine; the classifier itself is created
        later by train().
        """
        # NOTE(review): the parent's __init__ is not invoked here -- confirm
        # that SuspiciousCommands.__init__ sets nothing else this class needs.
        conf = Config.get_config()
        if conf and conf.model_base_path and conf.embedding_model:
            self.model_path = f"{conf.model_base_path}/{conf.embedding_model}"
        else:
            self.model_path = ""
        self.inference_engine = LlamaCppInferenceEngine()
        self.simple_nn = None  # Initialize to None, will be created in train

    async def train(self, phrases, labels, *, epochs=100, learning_rate=0.001):
        """
        Train the neural network with given phrases and labels.

        A fresh SimpleNN is created on every call, sized to the embedding
        dimension, and fitted one sample at a time with Adam and
        cross-entropy loss.

        Args:
            phrases (list of str): List of phrases to train on.
            labels (list of int): Corresponding labels for the phrases.
            epochs (int): Number of passes over the training data.
            learning_rate (float): Learning rate passed to the Adam optimizer.

        Raises:
            ValueError: If the embeddings are empty or not a list of lists,
                or if the number of labels differs from the number of
                embeddings.
        """
        embeds = await self.inference_engine.embed(self.model_path, phrases)
        # Check emptiness first so an empty result is reported as a
        # ValueError rather than an IndexError from embeds[0].
        if len(embeds) == 0 or not isinstance(embeds[0], list):
            raise ValueError("Embeddings should be a list of lists of floats")
        if len(embeds) != len(labels):
            # zip() would silently drop the unmatched tail otherwise.
            raise ValueError(
                f"Got {len(embeds)} embeddings but {len(labels)} labels; "
                "each phrase needs exactly one label"
            )
        embedding_dim = len(embeds[0])

        self.simple_nn = SimpleNN(input_dim=embedding_dim)
        criterion = nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(self.simple_nn.parameters(), lr=learning_rate)

        # Tensor conversion does not depend on the epoch, so do it once.
        samples = [
            (torch.FloatTensor(vector), torch.LongTensor([target]))
            for vector, target in zip(embeds, labels)
        ]

        # Training loop
        for _ in range(epochs):
            for data, label in samples:
                optimizer.zero_grad()
                outputs = self.simple_nn(data)
                # The model sees a single unbatched vector; add a batch
                # dimension so the output lines up with the 1-element label.
                loss = criterion(outputs.unsqueeze(0), label)
                loss.backward()
                optimizer.step()

    def save_model(self, file_name):
        """
        Save the trained model to a file in ONNX format.

        Does nothing when no model has been trained yet.

        Args:
            file_name (str): The file name to save the model.
        """
        if self.simple_nn is None:
            return
        # Create a dummy input with the correct embedding dimension
        dummy_input = torch.randn(1, self.simple_nn.network[0].in_features)
        torch.onnx.export(
            self.simple_nn,
            dummy_input,
            file_name,
            input_names=["input"],
            output_names=["output"],
        )
0 commit comments