Extract-to-PDF/app.py at main · lethal-dev/Extract-to-PDF · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import os
import pdfplumber
import openpyxl
import re


def extraer_datos_pdf(archivo_pdf):
    """Extract the NSS, CURP and RFC values from the text of a PDF.

    The PDF is opened with pdfplumber, the text of every page is collected
    and each field is looked up with ``obtener_valor``.

    Args:
        archivo_pdf: Value handed straight to ``pdfplumber.open`` (the
            caller in this file passes a filesystem path).

    Returns:
        A ``(nss, curp, rfc)`` tuple holding whatever ``obtener_valor``
        returns for each of the three fields.
    """
    with pdfplumber.open(archivo_pdf) as pdf:
        # Depending on the pdfplumber version, extract_text() can return
        # None for a page without a text layer; coalesce to "" so one such
        # page does not abort the whole document with a TypeError.
        paginas = [page.extract_text() or "" for page in pdf.pages]

    # Join with a newline so the last token of one page is never fused with
    # the first token of the next (which could corrupt a captured value).
    texto = "\n".join(paginas)

    nss = obtener_valor(texto, "NSS")
    curp = obtener_valor(texto, "CURP")
    rfc = obtener_valor(texto, "RFC")
    return nss, curp, rfc

def obtener_valor(texto, campo):
    """Return the first value that follows ``<campo>:`` in ``texto``.

    Matching is case-insensitive. For ``campo == "RFC"`` the value must be
    four letters, six digits and three alphanumeric characters; for any
    other field the value is the first run of word characters after the
    colon (optional whitespace in between is skipped).

    Args:
        texto: Text to search. ``None`` or an empty string yields ``""``.
        campo: Field label to look for; it is matched literally.

    Returns:
        The captured value as a string, or ``""`` when nothing matches.
    """
    if not texto:
        return ""

    if campo == "RFC":
        patron = r"\bRFC:\s*([A-Za-z]{4}\d{6}[A-Za-z\d]{3})"
    else:
        # Escape the label so characters such as "." or "(" in a field name
        # are matched literally instead of being interpreted as regex syntax.
        patron = r"\b" + re.escape(campo) + r":\s*(\w+)"

    coincidencia = re.search(patron, texto, re.IGNORECASE)
    return coincidencia.group(1) if coincidencia else ""


# Batch driver: read every PDF in the "pdfs" folder next to this script,
# extract NSS / CURP / RFC from each one and write one spreadsheet row per
# successfully processed file into "Listado General.xlsx".
carpeta_pdf = "./pdfs"
directorio_base = os.path.dirname(os.path.abspath(__file__))
ruta_carpeta_pdf = os.path.join(directorio_base, carpeta_pdf)
# Compare the extension case-insensitively so files named "*.PDF" are not
# silently skipped, and sort because os.listdir() returns entries in
# arbitrary order (sorting keeps the row order reproducible between runs).
archivos_pdf = sorted(
    f for f in os.listdir(ruta_carpeta_pdf) if f.lower().endswith(".pdf")
)
wb = openpyxl.Workbook()
hoja = wb.active
# Header row.
hoja["A1"] = "NSS"
hoja["B1"] = "CURP"
hoja["C1"] = "RFC"
fila_actual = 2  # next free data row (row 1 holds the headers)
for archivo_pdf in archivos_pdf:
    # Best effort: a file that fails is reported and skipped so the
    # remaining PDFs are still processed.
    try:
        ruta_archivo_pdf = os.path.join(ruta_carpeta_pdf, archivo_pdf)
        nss, curp, rfc = extraer_datos_pdf(ruta_archivo_pdf)
        for columna, valor in enumerate((nss, curp, rfc), start=1):
            hoja.cell(row=fila_actual, column=columna).value = valor
        fila_actual += 1
    except Exception as e:
        print(f"Error al procesar el archivo '{archivo_pdf}': {e}")
ruta_archivo_excel = os.path.join(directorio_base, "Listado General.xlsx")
wb.save(ruta_archivo_excel)