-
Notifications
You must be signed in to change notification settings - Fork 11
OD-1836 Convert Automated Archive output to working schema #85
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
d4f0b20
f8c98b4
8ff4a68
4a08b7e
b02e293
53415c0
cac604a
afb8e65
d5e4ab1
75d8f65
30733a7
c6731cd
8dfed1f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -22,7 +22,6 @@ | |
| args.temp_db_database | ||
| ) | ||
| ) | ||
| tags.create_tags_table() | ||
|
|
||
| tag_col_list = {} | ||
| stories_id_name = "" | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,10 +1,10 @@ | ||
| # -- coding: utf-8 -- | ||
|
|
||
| import datetime | ||
| from datetime import datetime | ||
| import codecs | ||
| import re | ||
| import os | ||
| from html.parser import HTMLParser | ||
| import html | ||
| import urllib.request | ||
|
|
||
| from pymysql import connect | ||
|
|
||
|
|
@@ -22,11 +22,15 @@ def _clean_file(filepath, log): | |
| :param filepath: Path to ARCHIVE_DB.pl | ||
| :return: Python dictionary keyed by original story id | ||
| """ | ||
| h = HTMLParser() | ||
| archive_db = codecs.open(filepath, "r", encoding="utf-8").read() | ||
| encoding = input( | ||
| 'Encoding for the ARCHIVE_DB.pl file, e.g. "utf-8", "latin_1", "cp1252" (default: "utf-8"): ' | ||
| ) | ||
| if encoding is None or encoding == "": | ||
| encoding = "utf-8" | ||
| archive_db = codecs.open(filepath, "r", encoding=encoding).read() | ||
|
|
||
| # Manually escape single quote entity and reformat file as a Python dictionary | ||
| step1 = h.unescape(archive_db.replace("'", "\\'")) | ||
| step1 = html.unescape(archive_db.replace("'", "\\'")) | ||
|
|
||
| # Indent the file with a single tab instead of whatever is currently used | ||
| step15 = re.sub(r"^\s+", "\t", step1) | ||
|
|
@@ -122,8 +126,32 @@ def _extract_fandoms(args, record): | |
| return tags.strip(", ") | ||
|
|
||
|
|
||
| def _extract_date(args, record, log): | ||
| date_string = record.get( | ||
| "PrintTime", | ||
| record.get( | ||
| "DatePrint", | ||
| record.get("Date", str(datetime.now().strftime("%m/%d/%y"))), | ||
| ), | ||
| ) | ||
|
|
||
| dt = None | ||
| try: | ||
| # If the date is in the form of a Unix timestamp | ||
| if date_string.isdigit(): | ||
| dt = datetime.fromtimestamp(int(date_string)) | ||
| else: | ||
| dt = datetime.strptime(date_string, "%m/%d/%y") | ||
| except Exception as e: | ||
| log.error( | ||
| f"Failed to parse date value '{date_string}' due to exception: {str(e)}" | ||
| ) | ||
|
|
||
| return dt.strftime("%Y-%m-%d") if dt else "" | ||
|
|
||
|
|
||
| def _create_mysql(args, FILES, log): | ||
| db = connect(args.db_host, args.db_user, args.db_password, "") | ||
| db = connect(host=args.db_host, user=args.db_user, password=args.db_password, db="") | ||
| cursor = db.cursor() | ||
| DATABASE_NAME = args.temp_db_database | ||
|
|
||
|
|
@@ -132,12 +160,13 @@ def _create_mysql(args, FILES, log): | |
| cursor.execute("create database {0};".format(DATABASE_NAME)) | ||
| cursor.execute("use {0}".format(DATABASE_NAME)) | ||
|
|
||
| sql = Sql(args) | ||
| codepath = os.path.dirname(os.path.realpath(__file__)) | ||
| # Instead of duplicating this file in the repo grab it from the master branch of eFiction | ||
| url = "https://raw.githubusercontent.com/otwcode/open-doors-eFiction/refs/heads/master/opendoors/open-doors-tables-working.sql" | ||
| with urllib.request.urlopen(url) as response: | ||
| script = response.read().decode() | ||
|
|
||
| sql.run_script_from_file( | ||
| codepath + "/shared_python/create-open-doors-tables.sql", database=DATABASE_NAME | ||
| ) | ||
| sql = Sql(args, log) | ||
| sql.run_sql_file(script, database=DATABASE_NAME) | ||
| db.commit() | ||
|
|
||
| authors = [ | ||
|
|
@@ -164,26 +193,17 @@ def _create_mysql(args, FILES, log): | |
| FILES[i].get("Summary", "").replace("'", "\\'"), | ||
| _extract_tags(args, FILES[i]), | ||
| _extract_characters(args, FILES[i]), | ||
| datetime.datetime.strptime( | ||
| FILES[i].get( | ||
| "PrintTime", | ||
| FILES[i].get( | ||
| "DatePrint", | ||
| FILES[i].get( | ||
| "Date", str(datetime.datetime.now().strftime("%m/%d/%y")) | ||
| ), | ||
| ), | ||
| ), | ||
| "%m/%d/%y", | ||
| ).strftime("%Y-%m-%d"), | ||
| _extract_date(args, FILES[i], log), | ||
| FILES[i].get("Location", "").replace("'", "\\'"), | ||
| FILES[i] | ||
| .get("LocationURL", FILES[i].get("StoryURL", "")) | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I wonder why these lines weren't indented, or is that a Python thing I've forgotten?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. All the items in this array have the same indentation, though it is kinda confusing that the ruff formatter broke up certain lines but not others (like the Location line vs the LocationURL line). So it may look odd but it is fine, unless you're talking about something else?
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Just about it looking odd and more difficult for a human to interpret since continuation lines are usually indented relative to their first line. However, this file isn't a high priority so if ruff is going to be run on the whole repo, we'll have to let it do its thing to avoid noisy diffs! |
||
| .replace("'", "\\'"), | ||
| FILES[i].get("Notes", "").replace("'", "\\'"), | ||
| _extract_relationships(args, FILES[i]), | ||
| FILES[i].get("Rating", ""), | ||
| FILES[i].get("Warnings", "").replace("'", "\\'"), | ||
| FILES[i] | ||
| .get("Warnings", FILES[i].get("OptionalWarnings", "")) | ||
| .replace("'", "\\'"), | ||
| FILES[i].get("Author", "").strip(), | ||
| FILES[i].get("Email", FILES[i].get("EmailAuthor", "")).lower().strip(), | ||
| FILES[i].get("FileType", args.chapters_file_extensions) | ||
|
|
@@ -196,6 +216,7 @@ def _create_mysql(args, FILES, log): | |
|
|
||
| cur = 0 | ||
| total = len(FILES) | ||
| item_dict = {} | ||
| for ( | ||
| original_id, | ||
| title, | ||
|
|
@@ -225,7 +246,7 @@ def _create_mysql(args, FILES, log): | |
| table_name = "stories" | ||
| else: | ||
| filename = url | ||
| table_name = "bookmarks" | ||
| table_name = "story_links" | ||
|
|
||
| # Clean up fandoms and add default fandom if it exists | ||
| final_fandoms = fandoms.replace("'", r"\'") | ||
|
|
@@ -241,10 +262,14 @@ def _create_mysql(args, FILES, log): | |
| if element[1] == author and element[2] == email | ||
| ] | ||
| authorid = result[0][0] | ||
| item_dict[original_id] = { | ||
| "authorid": authorid, | ||
| "itemtype": "story_link" if table_name == "story_links" else "story", | ||
| } | ||
|
|
||
| stor = """ | ||
| INSERT INTO {0} (id, fandoms, title, summary, tags, characters, date, url, notes, relationships, rating, warnings, author_id) | ||
| VALUES({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}', '{10}', '{11}', '{12}', '{13}');\n""".format( | ||
| INSERT INTO {0} (id, fandoms, title, summary, tags, characters, date, url, notes, relationships, rating, warnings) | ||
| VALUES({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}', '{10}', '{11}', '{12}');\n""".format( | ||
| table_name, | ||
| original_id, | ||
| final_fandoms.replace(r"\\", "\\"), | ||
|
|
@@ -258,7 +283,6 @@ def _create_mysql(args, FILES, log): | |
| pairings, | ||
| rating, | ||
| warnings, | ||
| authorid, | ||
| ) | ||
| cursor.execute(stor) | ||
| except: | ||
|
|
@@ -285,6 +309,21 @@ def _create_mysql(args, FILES, log): | |
| raise | ||
| db.commit() | ||
|
|
||
| for itemid, item_info in item_dict.items(): | ||
| try: | ||
| item_auth = """ | ||
| INSERT INTO item_authors (author_id, item_id, item_type) | ||
| VALUES({0}, {1}, '{2}');\n""".format( | ||
| item_info["authorid"], itemid, item_info["itemtype"] | ||
| ) | ||
| cursor.execute(item_auth) | ||
| except: | ||
| log.error( | ||
| f"Failed to insert item_authors for {item_info['itemtype']} {itemid} with author {item_info['authorid']}" | ||
| ) | ||
| raise | ||
| db.commit() | ||
|
|
||
|
|
||
| def clean_and_load_data(args, log): | ||
| data = _clean_file(args.db_input_file, log) | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.