Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/python-app-macos-windows.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ jobs:
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [ macos-latest, windows-latest ]
os: [ macos-13, windows-latest ] # Using macos-13 since macos-latest no longer supports 3.8

steps:
- uses: actions/checkout@v2
Expand Down
1 change: 0 additions & 1 deletion 02b-Extract-Tags-From-Stories.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@
args.temp_db_database
)
)
tags.create_tags_table()

tag_col_list = {}
stories_id_name = ""
Expand Down
97 changes: 68 additions & 29 deletions automated_archive/aa.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
# -*- coding: utf-8 -*-

import datetime
from datetime import datetime
import codecs
import re
import os
from html.parser import HTMLParser
import html
import urllib.request

from pymysql import connect

Expand All @@ -22,11 +22,15 @@ def _clean_file(filepath, log):
:param filepath: Path to ARCHIVE_DB.pl
:return: Python dictionary keyed by original story id
"""
h = HTMLParser()
archive_db = codecs.open(filepath, "r", encoding="utf-8").read()
encoding = input(
'Encoding for the ARCHIVE_DB.pl file, e.g. "utf-8", "latin_1", "cp1252" (default: "utf-8"): '
)
if encoding is None or encoding == "":
encoding = "utf-8"
archive_db = codecs.open(filepath, "r", encoding=encoding).read()

# Manually escape single quote entity and reformat file as a Python dictionary
step1 = h.unescape(archive_db.replace("'", "\\'"))
step1 = html.unescape(archive_db.replace("'", "\\'"))

# Indent the file with a single tab instead of whatever is currently used
step15 = re.sub(r"^\s+", "\t", step1)
Expand Down Expand Up @@ -122,8 +126,32 @@ def _extract_fandoms(args, record):
return tags.strip(", ")


def _extract_date(args, record, log):
date_string = record.get(
"PrintTime",
record.get(
"DatePrint",
record.get("Date", str(datetime.now().strftime("%m/%d/%y"))),
),
)

dt = None
try:
# If the date is in the form of a Unix timestamp
if date_string.isdigit():
dt = datetime.fromtimestamp(int(date_string))
else:
dt = datetime.strptime(date_string, "%m/%d/%y")
except Exception as e:
log.error(
f"Failed to parse date value '{date_string}' due to exception: {str(e)}"
)

return dt.strftime("%Y-%m-%d") if dt else ""


def _create_mysql(args, FILES, log):
db = connect(args.db_host, args.db_user, args.db_password, "")
db = connect(host=args.db_host, user=args.db_user, password=args.db_password, db="")
cursor = db.cursor()
DATABASE_NAME = args.temp_db_database

Expand All @@ -132,12 +160,13 @@ def _create_mysql(args, FILES, log):
cursor.execute("create database {0};".format(DATABASE_NAME))
cursor.execute("use {0}".format(DATABASE_NAME))

sql = Sql(args)
codepath = os.path.dirname(os.path.realpath(__file__))
# Instead of duplicating this file in the repo grab it from the master branch of eFiction
url = "https://raw.githubusercontent.com/otwcode/open-doors-eFiction/refs/heads/master/opendoors/open-doors-tables-working.sql"
with urllib.request.urlopen(url) as response:
script = response.read().decode()

sql.run_script_from_file(
codepath + "/shared_python/create-open-doors-tables.sql", database=DATABASE_NAME
)
sql = Sql(args, log)
sql.run_sql_file(script, database=DATABASE_NAME)
db.commit()

authors = [
Expand All @@ -164,26 +193,17 @@ def _create_mysql(args, FILES, log):
FILES[i].get("Summary", "").replace("'", "\\'"),
_extract_tags(args, FILES[i]),
_extract_characters(args, FILES[i]),
datetime.datetime.strptime(
FILES[i].get(
"PrintTime",
FILES[i].get(
"DatePrint",
FILES[i].get(
"Date", str(datetime.datetime.now().strftime("%m/%d/%y"))
),
),
),
"%m/%d/%y",
).strftime("%Y-%m-%d"),
_extract_date(args, FILES[i], log),
FILES[i].get("Location", "").replace("'", "\\'"),
FILES[i]
.get("LocationURL", FILES[i].get("StoryURL", ""))
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder why these lines weren't indented, or is that a Python thing I've forgotten?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

All the items in this array have the same indentation, though it is kinda confusing that the ruff formatter broke up certain lines but not others (like the Location line vs the LocationURL line). So it may look odd but it is fine, unless you're talking about something else?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just about it looking odd and more difficult for a human to interpret since continuation lines are usually indented relative to their first line. However, this file isn't a high priority so if ruff is going to be run on the whole repo, we'll have to let it do its thing to avoid noisy diffs!

.replace("'", "\\'"),
FILES[i].get("Notes", "").replace("'", "\\'"),
_extract_relationships(args, FILES[i]),
FILES[i].get("Rating", ""),
FILES[i].get("Warnings", "").replace("'", "\\'"),
FILES[i]
.get("Warnings", FILES[i].get("OptionalWarnings", ""))
.replace("'", "\\'"),
FILES[i].get("Author", "").strip(),
FILES[i].get("Email", FILES[i].get("EmailAuthor", "")).lower().strip(),
FILES[i].get("FileType", args.chapters_file_extensions)
Expand All @@ -196,6 +216,7 @@ def _create_mysql(args, FILES, log):

cur = 0
total = len(FILES)
item_dict = {}
for (
original_id,
title,
Expand Down Expand Up @@ -225,7 +246,7 @@ def _create_mysql(args, FILES, log):
table_name = "stories"
else:
filename = url
table_name = "bookmarks"
table_name = "story_links"

# Clean up fandoms and add default fandom if it exists
final_fandoms = fandoms.replace("'", r"\'")
Expand All @@ -241,10 +262,14 @@ def _create_mysql(args, FILES, log):
if element[1] == author and element[2] == email
]
authorid = result[0][0]
item_dict[original_id] = {
"authorid": authorid,
"itemtype": "story_link" if table_name == "story_links" else "story",
}

stor = """
INSERT INTO {0} (id, fandoms, title, summary, tags, characters, date, url, notes, relationships, rating, warnings, author_id)
VALUES({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}', '{10}', '{11}', '{12}', '{13}');\n""".format(
INSERT INTO {0} (id, fandoms, title, summary, tags, characters, date, url, notes, relationships, rating, warnings)
VALUES({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}', '{10}', '{11}', '{12}');\n""".format(
table_name,
original_id,
final_fandoms.replace(r"\\", "\\"),
Expand All @@ -258,7 +283,6 @@ def _create_mysql(args, FILES, log):
pairings,
rating,
warnings,
authorid,
)
cursor.execute(stor)
except:
Expand All @@ -285,6 +309,21 @@ def _create_mysql(args, FILES, log):
raise
db.commit()

for itemid, item_info in item_dict.items():
try:
item_auth = """
INSERT INTO item_authors (author_id, item_id, item_type)
VALUES({0}, {1}, '{2}');\n""".format(
item_info["authorid"], itemid, item_info["itemtype"]
)
cursor.execute(item_auth)
except:
log.error(
f"Failed to insert item_authors for {item_info['itemtype']} {itemid} with author {item_info['authorid']}"
)
raise
db.commit()


def clean_and_load_data(args, log):
data = _clean_file(args.db_input_file, log)
Expand Down
5 changes: 4 additions & 1 deletion shared_python/Chapters.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,8 @@ def _gather_and_dedupe(self, chapters_path, extensions, has_ids=False):
for cid, duplicate in duplicate_chapters.items():
# look up the author id and add that one to the file_names list
sql_author_id = self.sql.execute_and_fetchall(
"SELECT author_id FROM chapters WHERE id = {0}".format(cid)
self.sql.database,
"SELECT author_id FROM chapters WHERE id = {0}".format(cid),
)
if len(sql_author_id) > 0:
author_id = sql_author_id[0][0]
Expand Down Expand Up @@ -142,6 +143,8 @@ def populate_chapters(self, folder=None, extensions=None):
else:
for _, chapter_path in file_paths.items():
path = chapter_path.replace(self.args.chapters_path, "")[1:]
if os.sep == "\\": # if this script is run on windows
path = path.replace("\\", "/")
with codecs.open(chapter_path, "r", encoding=char_encoding) as c:
try:
cur = Common.print_progress(cur, total)
Expand Down
2 changes: 2 additions & 0 deletions shared_python/Sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,9 @@ def run_script_from_file(self, filename, database, initial_load=False):
fd = open(filename, "r")
sqlFile = fd.read()
fd.close()
self.run_sql_file(sqlFile, database, initial_load)

def run_sql_file(self, sqlFile, database, initial_load=False):
# replace placeholders and return all SQL commands (split on ';')
sqlCommands = sqlFile.replace("$DATABASE$", database).split(";\n")

Expand Down
55 changes: 36 additions & 19 deletions shared_python/Tags.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import re
from collections import defaultdict
from html.parser import HTMLParser
from logging import Logger

Expand Down Expand Up @@ -83,8 +84,9 @@ def populate_tag_table(
)
)

tags_to_insert = {}
tags_to_story_ids = defaultdict(list)
for story_tags_row in data:
values = []
for col in tag_columns:
needs_fandom = col in tags_with_fandoms
if story_tags_row[col] is not None:
Expand All @@ -93,27 +95,42 @@ def populate_tag_table(
if isinstance(
tag_col_lookup[col], str
): # Probably AA or a custom archive
cleaned_tag = (
val.encode("utf-8").replace("'", "'").strip()
cleaned_tag = re.sub(
r'(?<!\\)"',
'\\"',
val.replace("'", "'").strip(),
)

values.append(
'({0}, "{1}", "{2}", "{3}")'.format(
story_tags_row[story_id_col_name],
re.sub(r'(?<!\\)"', '\\"', cleaned_tag),
tag_col_lookup[col],
story_tags_row["fandoms"]
if needs_fandom
else "",
)
tags_to_story_ids[cleaned_tag].append(
story_tags_row[story_id_col_name]
)
tags_to_insert[
cleaned_tag
] = '("{0}", "{1}", "{2}")'.format(
cleaned_tag,
tag_col_lookup[col],
story_tags_row["fandoms"] if needs_fandom else "",
)

if len(values) > 0:
self.sql.execute(
"""
INSERT INTO tags (storyid, original_tag, original_table, ao3_tag_fandom) VALUES {0}
""".format(", ".join(values))
)
if len(tags_to_insert) > 0:
self.sql.execute(
"""
INSERT INTO tags (original_tag, original_type, ao3_tag_fandom) VALUES {0}
""".format(", ".join(tags_to_insert.values()))
)

tag_data = self.sql.execute_dict("SELECT id, original_tag FROM tags")
for tag_row in tag_data:
story_ids = set(tags_to_story_ids[tag_row["original_tag"]])
for story_id in story_ids:
self.sql.execute(
"""
INSERT INTO item_tags (item_id, item_type, tag_id) VALUES ({0}, "{1}", {2})
""".format(
story_id,
"story_link" if table_name == "story_links" else "story",
tag_row["id"],
)
)

def distinct_tags(self, database):
"""
Expand Down