-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpreprocess.py
More file actions
29 lines (21 loc) · 934 Bytes
/
preprocess.py
File metadata and controls
29 lines (21 loc) · 934 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
import numpy as np
import pickle

# Preprocessing script: reads the labelled ciphertext CSV, converts labels and
# ciphertexts to integer form, splits the rows into train/test sets, and writes
# the resulting arrays plus the fitted tokenizer to the current directory.

# --- Configuration ----------------------------------------------------------
DATASET_PATH = 'cryptographic_dataset.csv'
MAX_SEQUENCE_LENGTH = 128  # Set according to your data
TEST_FRACTION = 0.2
SPLIT_SEED = 42  # fixed seed so the train/test split is reproducible

# --- Load and encode labels -------------------------------------------------
df = pd.read_csv(DATASET_PATH)

# Replace the 'algorithm_used' column with integer class ids.
le = LabelEncoder()
df['algorithm_used'] = le.fit_transform(df['algorithm_used'])

# --- Tokenize ciphertexts ---------------------------------------------------
# char_level=True makes the tokenizer treat every character as its own token.
# NOTE(review): the vocabulary is fitted on the full dataset, before the
# train/test split below.
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(df['ciphertext'])
X = tokenizer.texts_to_sequences(df['ciphertext'])

# Force every sequence to the same length (the side used for padding and
# truncation is left at the library default).
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)

# --- Split and persist ------------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X,
    df['algorithm_used'],
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

np.save('X_train.npy', X_train)
np.save('X_test.npy', X_test)
np.save('y_train.npy', y_train)
np.save('y_test.npy', y_test)

# Save the fitted tokenizer next to the arrays so the same character-to-index
# mapping can be reloaded later. The dump call must sit inside the `with`
# body so the file handle is open when it runs and closed right after.
with open('tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)