ableinc · jimzord12 · Apr 8, 2024 · Apr 8, 2024 · Apr 8, 2024 · Apr 8, 2024
diff --git a/.gitignore b/.gitignore
@@ -1,2 +1,5 @@
 .env
-training_data/
+training_data/
+your_code_base.txt
+your_code_base.pdf
+myenv/
diff --git a/README.md b/README.md
@@ -1,31 +1,45 @@
 # git2txt
 
-Convert all files in git repository to .txt files. This is useful for training LLMs on your codebase.
+Converts the text files of a git repository (those whose extension is listed in `is_text_file`) into .txt files. It also generates a single combined .txt file and a .pdf file containing the whole codebase. This is useful for training LLMs on your codebase.
 
 ## How to Use
 
 1. Create new .env file by copying example.env
+
 ```shell
 cp example.env .env
 ```
+
 2. Add necessary fields. The default fields are good to start with.
+
 ```bash
-GIT_PROJECT_DIRECTORY=/path/to/git/repo
+GIT_PROJECT_DIRECTORY=/path/to/git/repo  # e.g. C:\Users\MyUserName\Codebases\GitHub\my-project-name
 IGNORE_FILES=.env,package-lock.json
 IGNORE_DIRS=.git,.vscode,node_modules
 SAVE_DIRECTORY=training_data
 SKIP_EMPTY_FILES=true
+
+SOURCE_DIR=training_data
+OUTPUT_FILE=your_code_base.txt
+PDF_OUTPUT=your_code_base.pdf
 ```
+
 3. Install dependencies. Using a virtual environment is recommended.
+
 ```shell
 python -m pip install -r requirements.txt
 ```
-4. Run program
+
+4. In the `is_text_file` function in `main.py`, add the extensions of the file types you want converted. Files with any other extension are skipped.
+
+5. Run program
+
 ```shell
 python main.py
 ```
-5. You'll see your data files in the ```training_data/``` directory. This will be different if you changed the path via ```SAVE_DIRECTORY``` in ```.env``` file.
 
+6. You'll see your data files in the `training_data/` directory (or in the directory you set via `SAVE_DIRECTORY` in the `.env` file). The combined text and PDF files are written to the paths set by `OUTPUT_FILE` and `PDF_OUTPUT`.
 
 ## Notes
-- This program requires Python version 3.6 or later. It uses the f-string formatting technique introduced in Python 3.6.
+
+- This program requires Python version 3.6 or later. It uses the f-string formatting technique introduced in Python 3.6.
diff --git a/example.env b/example.env
@@ -1,5 +1,9 @@
-GIT_PROJECT_DIRECTORY=
+GIT_PROJECT_DIRECTORY=/path/to/git/repo
 IGNORE_FILES=.env,package-lock.json
 IGNORE_DIRS=.git,.vscode,node_modules
 SAVE_DIRECTORY=training_data
-SKIP_EMPTY_FILES=true
+SKIP_EMPTY_FILES=true
+
+SOURCE_DIR=training_data
+OUTPUT_FILE=your_code_base.txt
+PDF_OUTPUT=your_code_base.pdf
diff --git a/main.py b/main.py
@@ -2,9 +2,53 @@
 import os
 import hashlib
 import sys
-load_env()
+from reportlab.pdfgen import canvas
 
 
+load_env()  # default lookup, as before this change: the README has users copy example.env to .env
+
+def is_text_file(file_path, extensions=('.txt', '.md', '.go', '.py', '.java', '.html', '.css', '.js', '.mod', '.sum')):
+    """Return True if file_path ends, case-insensitively, with one of `extensions` (give them in lowercase; extend the default as needed)."""
+    return file_path.lower().endswith(tuple(extensions))
+
+def combine_txt_files_and_create_pdf(source_directory, output_file, pdf_output, separator='**'):
+    """Combine every .txt file under source_directory into output_file and pdf_output.
+
+    Each file is framed by separator lines with its name centred between the first two."""
+    separator_line = separator * 40 + '\n'
+    combined_text = []  # every section written to the TXT, replayed below for the PDF
+    output_path = os.path.abspath(output_file)
+
+    with open(output_file, 'w', encoding='utf-8') as outfile:
+        for root, dirs, files in os.walk(source_directory):
+            dirs.sort()  # os.walk order is arbitrary; sort in place for reproducible output
+            for filename in sorted(files):
+                file_path = os.path.join(root, filename)
+                # Skip non-.txt files, and the combined file itself if it lives inside the tree.
+                if not filename.endswith('.txt') or os.path.abspath(file_path) == output_path:
+                    continue
+                with open(file_path, 'r', encoding='utf-8') as infile:
+                    content = infile.read()
+                header = f"{filename.center(len(separator_line))}\n"
+                section = [separator_line, header, separator_line, content + '\n', separator_line]
+                combined_text.extend(section)
+                outfile.writelines(section)
+
+    # Write to the PDF file
+    c = canvas.Canvas(pdf_output)
+    text = c.beginText(40, 800)  # Starting position
+    for line in combined_text:
+        for subline in line.split('\n'):
+            text.textLine(subline.rstrip())  # keep leading whitespace: it is code indentation
+            if text.getY() < 40:  # Move to a new page if there's no space
+                c.drawText(text)
+                c.showPage()
+                text = c.beginText(40, 800)
+    c.drawText(text)
+    c.save()
+
+    print(f'All text files have been combined into {output_file} and {pdf_output}')
+
 def ignore_dir(file_path: str) -> bool:
     for _dir in IGNORE_DIRS:
         if _dir in file_path:
@@ -25,9 +69,9 @@ def get_file_path() -> None:
 
 def write_txt(txt_data: str, file_name: str, md5_hash: str) -> None:
     full_path = os.path.join(save_directory, file_name + f'_{md5_hash}.txt')
-    with open(full_path, mode='w') as data:
+    with open(full_path, mode='w', encoding='utf-8') as data:
         data.write(txt_data)
-    print(f'TXT written to: {full_path}')
+    print(f'TXT written to: {full_path}\n')
 
 
 def main() -> None:
@@ -42,9 +86,15 @@ def main() -> None:
     print('Creating TXT...')
     for index, file in enumerate(FILES):
         print(f'File #{index+1}: {file}')
-        # If line is empty, skip it
+
+        #if file is not a text file, skip it
+        if not is_text_file(file):
+            print(f'Skipping: [{os.path.basename(file)}] a (probably) non-text file.\n')
+            continue
+
+        # If file is empty, skip it
         if os.environ.get('SKIP_EMPTY_FILES').upper() == 'TRUE' and os.path.getsize(file) == 0:
-            print('FILE IS EMPTY. SKIPPING.')
+            print('FILE IS EMPTY. SKIPPING.\n')
             continue
         with open(file, mode='r', encoding='utf-8') as git_file:
             md5_hash = hashlib.md5(git_file.read().encode('utf-8')).hexdigest()
@@ -68,3 +118,9 @@ def main() -> None:
         os.makedirs(save_directory, exist_ok=True)
     main()
     print(f'Training data can be found in {save_directory}/ directory.')
+
+    # Combine the generated .txt files into one TXT and one PDF; paths come from the environment.
+    source_dir = os.environ.get('SOURCE_DIR', save_directory)  # default: where main() just wrote its files
+    output_file = os.environ.get('OUTPUT_FILE', 'your_code_base.txt')  # The final combined text file
+    pdf_output = os.environ.get('PDF_OUTPUT', 'your_code_base.pdf')  # The final PDF file
+    combine_txt_files_and_create_pdf(source_dir, output_file, pdf_output)
diff --git a/requirements.txt b/requirements.txt
@@ -1 +1,6 @@
-pydotenvs==0.2.0
+chardet==5.2.0
+click==8.1.7
+colorama==0.4.6
+pillow==10.3.0
+pydotenvs==0.2.0
+reportlab==4.1.0