Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 52 additions & 0 deletions merge-export-file-script/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# Merge Export File Script

This script is designed to merge CSV files from a Datasaur exported ZIP file and output a new ZIP file containing the merged CSVs.

## Prerequisites

- Python 3.x
- Ensure you have the necessary permissions to read/write files in the directories you are working with.

## Installation

- Clone the repository or download the script to your local machine.
- Ensure Python is installed on your system. You can download it from [python.org](https://www.python.org/).

## Usage

To run the script, use the following command in your terminal or command prompt:

```bash
python merge.py -I <input_file_path> -O <output_file_path>
```

### Arguments

- `-I`, `--input`: Required. The path to the input Datasaur exported ZIP file.
- `-O`, `--output`: Required. The path where the output ZIP file will be saved.

### Example

```bash
python merge.py -I /path/to/input.zip -O /path/to/output.zip
```

This command will:

- Validate the input ZIP file to ensure it exists and is a valid ZIP file.
- Extract the contents of the input ZIP file to a temporary directory.
- Merge all CSV files found in each folder within the extracted contents into an `all_merged.csv` file inside that folder.
- Create a new ZIP file at the specified output path containing the extracted files together with the merged CSV files.
- Clean up the temporary directory used during the process.

## Notes

- Ensure the input file is a valid ZIP file containing CSV files to be merged.
- The output file path should not already exist, as the script will not overwrite existing files.
- The script will create a temporary directory named tmp in the current working directory. Ensure you have write permissions in this directory.

## Troubleshooting

- If you encounter a `FileNotFoundError`, ensure the input file path is correct.
- If you encounter a `FileExistsError`, ensure the output file path does not already exist.
- For any other issues, ensure you have the necessary permissions and that your Python environment is correctly set up.
117 changes: 117 additions & 0 deletions merge-export-file-script/merge.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
import argparse
import csv
import os
import shutil
import zipfile


def create_dirs(path):
    """Create the directory ``path``, including any missing parent directories.

    An already existing directory is left untouched.

    Raises:
        FileExistsError: ``path`` exists but is not a directory.
    """
    # exist_ok=True replaces the separate os.path.exists() check, which could
    # race with another process creating the directory in between.
    os.makedirs(path, exist_ok=True)


def clean_tmp_dir(tmp_dir):
    """Recursively delete the working directory ``tmp_dir`` and all its contents."""
    shutil.rmtree(path=tmp_dir)


def validate_input_file(input_file_path):
    """Check that ``input_file_path`` points to an existing ZIP archive.

    Raises:
        FileNotFoundError: the path does not exist.
        ValueError: the name does not end in ``.zip`` (any letter case), or
            the file content is not a ZIP archive.
    """
    if not os.path.exists(input_file_path):
        raise FileNotFoundError(f"Input file {input_file_path} does not exist")

    # Compare the extension case-insensitively so names such as "EXPORT.ZIP"
    # are not rejected even though they are perfectly valid archives.
    if not input_file_path.lower().endswith(".zip"):
        raise ValueError(f"Input file {input_file_path} is not a zip file")

    if not zipfile.is_zipfile(input_file_path):
        raise ValueError(f"Input file {input_file_path} is not a valid zip file")


def validate_output_file(output_file_path):
    """Check that ``output_file_path`` is a usable target for the output archive.

    Raises:
        FileExistsError: something already exists at the path (it is never
            overwritten).
        ValueError: the name does not end in ``.zip`` (any letter case).
    """
    if os.path.exists(output_file_path):
        raise FileExistsError(f"Output file {output_file_path} already exists")

    # Case-insensitive so that "OUT.ZIP" is accepted just like "out.zip".
    if not output_file_path.lower().endswith(".zip"):
        raise ValueError(f"Output file {output_file_path} is not a zip file")


def read_csv_with_dict_reader(csv_file_path):
    """Read the CSV file at ``csv_file_path`` into a list of row dictionaries.

    The first line of the file is used as the header; each following record
    becomes one dict mapping column name to cell value.
    """
    # newline="" lets the csv module do its own line-ending handling, so
    # line breaks embedded in quoted fields are preserved verbatim.
    with open(csv_file_path, "r", newline="") as f:
        return list(csv.DictReader(f))


def write_csv_with_dict_writer(csv_file_path, data):
    """Write the row dictionaries in ``data`` to a CSV file at ``csv_file_path``.

    The header is the union of the keys of all rows, in first-seen order, so
    rows coming from files with different columns can be written together;
    cells a row has no value for are left empty. When ``data`` contains no
    columns at all, an empty file is written.
    """
    # dict.fromkeys de-duplicates while keeping first-seen column order.
    fieldnames = list(dict.fromkeys(key for row in data for key in row))

    # newline="" stops the text layer from translating the writer's "\r\n",
    # which would otherwise produce blank lines between rows on Windows.
    with open(csv_file_path, "w", newline="") as f:
        if not fieldnames:
            return
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(data)


def merge_csv_files(csv_files):
    """Return the rows of every file in ``csv_files`` as one flat list.

    Rows keep the order of ``csv_files`` and, within a file, the order in
    which they were read.
    """
    return [
        row
        for path in csv_files
        for row in read_csv_with_dict_reader(path)
    ]


def do_merge_csv_files_per_folder(folder_path):
    """Merge the CSV files directly inside ``folder_path`` into ``all_merged.csv``.

    Files are merged in sorted name order so the result is reproducible. A
    previously written ``all_merged.csv`` is not used as an input, and nothing
    is written when the folder yields no rows to merge.
    """
    merged_name = "all_merged.csv"
    csv_files = [
        os.path.join(folder_path, name)
        # os.listdir returns entries in arbitrary order; sort for determinism.
        for name in sorted(os.listdir(folder_path))
        # Skip the merge target so a re-run does not fold it into itself.
        if name.endswith(".csv") and name != merged_name
    ]
    if not csv_files:
        return

    data = merge_csv_files(csv_files)
    if not data:
        return

    write_csv_with_dict_writer(os.path.join(folder_path, merged_name), data)


def zip_folder(folder_path, output_path):
    """Write every file under ``folder_path`` into a new ZIP at ``output_path``.

    Member names are relative to ``folder_path``. Members are deflate-compressed
    and added in sorted order so the archive layout is reproducible.
    """
    # The zipfile default is ZIP_STORED (no compression); CSV text compresses well.
    with zipfile.ZipFile(output_path, "w", compression=zipfile.ZIP_DEFLATED) as zipf:
        for root, dirs, files in os.walk(folder_path):
            # Sorting in place makes os.walk descend in a deterministic order.
            dirs.sort()
            for file in sorted(files):
                file_path = os.path.join(root, file)
                arcname = os.path.relpath(file_path, folder_path)
                zipf.write(file_path, arcname)


def extract_zip_file(zip_file_path, output_dir):
    """Unpack every member of the archive ``zip_file_path`` into ``output_dir``."""
    with zipfile.ZipFile(zip_file_path, mode="r") as archive:
        archive.extractall(path=output_dir)


def write_zip_file(zip_file_path, file_path):
    """Create a new archive at ``zip_file_path`` holding only ``file_path``.

    The file is stored under its base name, without any directory prefix.
    """
    member_name = os.path.basename(file_path)
    with zipfile.ZipFile(zip_file_path, mode="w") as archive:
        archive.write(file_path, arcname=member_name)


def main():
    """Command-line entry point: merge the CSVs of a Datasaur export archive.

    Validates the ``--input`` and ``--output`` paths, extracts the input
    archive into a fresh working directory, merges the CSV files of every
    folder below the archive's top-level folder, zips the result to the output
    path, and removes the working directory again, even when a step fails.

    Raises:
        ValueError: the extracted archive has no top-level folder.
    """
    # Imported locally so this block does not depend on the file's import header.
    import tempfile

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-I", "--input", required=True, help="Input Datasaur exported zip file path"
    )
    parser.add_argument("-O", "--output", required=True, help="Output zip file path")
    args = parser.parse_args()

    input_zip_file = args.input
    output_zip_file = args.output

    validate_input_file(input_zip_file)
    validate_output_file(output_zip_file)

    # A unique "tmp*" directory in the current working directory: a leftover or
    # unrelated ./tmp can neither leak stale entries into the result nor be
    # deleted by the cleanup below.
    tmp_dir = tempfile.mkdtemp(prefix="tmp", dir=os.getcwd())
    try:
        extract_zip_file(input_zip_file, tmp_dir)

        # Pick the top-level folder deterministically instead of relying on
        # whatever os.listdir happens to return first.
        top_level_dirs = sorted(
            entry
            for entry in os.listdir(tmp_dir)
            if os.path.isdir(os.path.join(tmp_dir, entry))
        )
        if not top_level_dirs:
            raise ValueError(
                f"Input file {input_zip_file} does not contain a top-level folder"
            )
        base_extracted_path = os.path.join(tmp_dir, top_level_dirs[0])

        folders = [
            os.path.join(base_extracted_path, folder)
            for folder in sorted(os.listdir(base_extracted_path))
            if os.path.isdir(os.path.join(base_extracted_path, folder))
        ]

        for folder in folders:
            do_merge_csv_files_per_folder(folder)

        zip_folder(base_extracted_path, output_zip_file)
    finally:
        # Always remove the working directory, including on failure.
        clean_tmp_dir(tmp_dir)


if __name__ == "__main__":
    main()
Binary file added merge-export-file-script/sample-input/sample.zip
Binary file not shown.