Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ jobs:
setup:
strategy:
matrix:
python-version: ["3.11", "3.12", "3.13"]
python-version: ["3.11", "3.12", "3.13", "3.14"]
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
Expand All @@ -41,7 +41,7 @@ jobs:
lint:
strategy:
matrix:
python-version: ["3.11", "3.12", "3.13"]
python-version: ["3.11", "3.12", "3.13", "3.14"]
runs-on: ubuntu-latest
needs: [setup, changelog]
steps:
Expand Down Expand Up @@ -72,7 +72,7 @@ jobs:
test_unit:
strategy:
matrix:
python-version: ["3.11", "3.12", "3.13"]
python-version: ["3.11", "3.12", "3.13", "3.14"]
runs-on: opensource-linux-8core
needs: [setup, lint]
steps:
Expand Down Expand Up @@ -102,7 +102,7 @@ jobs:
test_unit_no_extras:
strategy:
matrix:
python-version: ["3.11", "3.12", "3.13"]
python-version: ["3.11", "3.12", "3.13", "3.14"]
runs-on: ubuntu-latest
needs: [setup, lint]
steps:
Expand All @@ -125,7 +125,7 @@ jobs:
test_unit_dependency_extras:
strategy:
matrix:
python-version: ["3.11", "3.12", "3.13"]
python-version: ["3.11", "3.12", "3.13", "3.14"]
extra: ["csv", "docx", "odt", "markdown", "pypandoc", "pdf-image", "pptx", "xlsx"]
include:
- extra: csv
Expand Down
89 changes: 47 additions & 42 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[project]
name = "unstructured"
description = "A library that prepares raw documents for downstream ML tasks."
requires-python = ">=3.11, <3.14"
requires-python = ">=3.11, <3.15"
authors = [{name = "Unstructured Technologies", email = "devops@unstructuredai.io"}]
classifiers = [
"Development Status :: 4 - Beta",
Expand All @@ -14,6 +14,7 @@ classifiers = [
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Programming Language :: Python :: 3.13",
"Programming Language :: Python :: 3.14",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
]
readme = "README.md"
Expand All @@ -22,28 +23,28 @@ keywords = ["NLP", "PDF", "HTML", "CV", "XML", "parsing", "preprocessing"]
dynamic = ["version"]
dependencies = [
"beautifulsoup4>=4.14.3, <5.0.0",
"charset-normalizer>=3.4.4, <4.0.0",
"charset-normalizer>=3.4.6, <4.0.0",
"emoji>=2.15.0, <3.0.0",
"filetype>=1.2.0, <2.0.0",
"html5lib>=1.1, <2.0.0",
"langdetect>=1.0.9, <2.0.0",
"lxml>=5.0.0, <7.0.0",
"spacy>=3.7.0, <4.0.0",
"installer>=0.7.0, <1.0.0",
"numba>=0.60.0, <1.0.0",
"numpy>=1.26.0, <3.0.0",
"lxml>=6.0.2, <7.0.0",
"spacy>=3.8.13, <4.0.0",
"installer>=1.0.0, <2.0.0",
"numba>=0.64.0, <1.0.0",
"numpy>=2.4.4, <3.0.0",
"psutil>=7.2.2, <8.0.0",
"python-iso639>=2026.1.31, <2027.0.0",
"python-magic>=0.4.27, <1.0.0",
"python-oxmsg>=0.0.2, <1.0.0",
"rapidfuzz>=3.14.3, <4.0.0",
"regex>=2024.0.0, <2027.0.0",
"requests>=2.32.5, <3.0.0",
"regex>=2026.3.32, <2027.0.0",
"requests>=2.33.1, <3.0.0",
"tqdm>=4.67.3, <5.0.0",
"typing-extensions>=4.15.0, <5.0.0",
"unstructured-client>=0.25.9, <1.0.0",
"wrapt>=2.1.1, <3.0.0",
"filelock>=3.12.0,<4.0.0",
"unstructured-client>=0.42.12, <1.0.0",
"wrapt>=2.1.2, <3.0.0",
"filelock>=3.25.2,<4.0.0",
]

[project.optional-dependencies]
Expand All @@ -58,32 +59,32 @@ docx = [
"python-docx>=1.2.0, <2.0.0",
]
epub = [
"pypandoc-binary>=1.16.2, <2.0.0; platform_system != 'Windows'",
"pypandoc-binary>=1.16.2, <2.0.0; platform_system == 'Windows' and python_version < '3.13'",
"pypandoc-binary>=1.17.0, <2.0.0; platform_system != 'Windows'",
"pypandoc-binary>=1.17.0, <2.0.0; platform_system == 'Windows' and python_version < '3.14'",
]
image = [
"google-cloud-vision>=3.12.1, <4.0.0",
"google-cloud-vision>=3.13.0, <4.0.0",
"pdf2image>=1.17.0, <2.0.0",
"pdfminer.six>=20251230, <20270000",
"pi-heif>=1.2.0, <2.0.0",
"pikepdf>=10.3.0, <11.0.0",
"pypdf>=6.6.2, <7.0.0",
"pdfminer.six>=20260107, <20270000",
"pi-heif>=1.3.0, <2.0.0",
"pikepdf>=10.5.1, <11.0.0",
"pypdf>=6.9.2, <7.0.0",
"unstructured-inference>=1.6.2, <2.0.0; platform_system != 'Windows' and python_version >= '3.12'",
"unstructured-inference>=1.2.0, <2.0.0; platform_system != 'Windows' and python_version < '3.12'",
"unstructured-inference>=1.6.2, <2.0.0; platform_system == 'Windows' and python_version >= '3.12' and python_version < '3.13'",
"unstructured-pytesseract>=0.3.15, <1.0.0",
]
md = [
"markdown>=3.10.1, <4.0.0",
"markdown>=3.10.2, <4.0.0",
]
odt = [
"pypandoc-binary>=1.16.2, <2.0.0; platform_system != 'Windows'",
"pypandoc-binary>=1.16.2, <2.0.0; platform_system == 'Windows' and python_version < '3.13'",
"pypandoc-binary>=1.17.0, <2.0.0; platform_system != 'Windows'",
"pypandoc-binary>=1.17.0, <2.0.0; platform_system == 'Windows' and python_version < '3.14'",
"python-docx>=1.2.0, <2.0.0",
]
org = [
"pypandoc-binary>=1.16.2, <2.0.0; platform_system != 'Windows'",
"pypandoc-binary>=1.16.2, <2.0.0; platform_system == 'Windows' and python_version < '3.13'",
"pypandoc-binary>=1.17.0, <2.0.0; platform_system != 'Windows'",
"pypandoc-binary>=1.17.0, <2.0.0; platform_system == 'Windows' and python_version < '3.14'",
]
pdf = [
"unstructured[image]",
Expand All @@ -95,26 +96,26 @@ pptx = [
"python-pptx>=1.0.2, <2.0.0",
]
rtf = [
"pypandoc-binary>=1.16.2, <2.0.0; platform_system != 'Windows'",
"pypandoc-binary>=1.16.2, <2.0.0; platform_system == 'Windows' and python_version < '3.13'",
"pypandoc-binary>=1.17.0, <2.0.0; platform_system != 'Windows'",
"pypandoc-binary>=1.17.0, <2.0.0; platform_system == 'Windows' and python_version < '3.14'",
]
rst = [
"pypandoc-binary>=1.16.2, <2.0.0; platform_system != 'Windows'",
"pypandoc-binary>=1.16.2, <2.0.0; platform_system == 'Windows' and python_version < '3.13'",
"pypandoc-binary>=1.17.0, <2.0.0; platform_system != 'Windows'",
"pypandoc-binary>=1.17.0, <2.0.0; platform_system == 'Windows' and python_version < '3.14'",
]
tsv = [
"unstructured[csv]",
]
xlsx = [
"msoffcrypto-tool>=6.0.0, <7.0.0",
"networkx>=3.2.0, <4.0.0",
"networkx>=3.6.1, <4.0.0",
"openpyxl>=3.1.5, <4.0.0",
"pandas>=2.0.0, <4.0.0",
"xlrd>=2.0.1, <3.0.0",
"xlrd>=2.0.2, <3.0.0",
]
# Speech-to-text for partition_audio (multimodal: audio -> elements)
audio = [
"openai-whisper>=20231117, <20270000",
"openai-whisper>=20250625, <20270000",
]
all-docs = [
"unstructured[audio,csv,doc,docx,epub,image,md,odt,org,pdf,ppt,pptx,rtf,rst,tsv,xlsx]",
Expand All @@ -124,22 +125,23 @@ chunking-tokens = [
"tiktoken>=0.12.0, <1.0.0",
]
huggingface = [
"sentencepiece>=0.2.0, <1.0.0",
"sentencepiece>=0.2.1, <1.0.0",
"torch>=2.10.0, <3.0.0; platform_system != 'Windows'",
"torch>=2.10.0, <3.0.0; platform_system == 'Windows' and python_version < '3.13'",
"transformers>=5.2.0, <6.0.0",
"torch>=2.10.0, <3.0.0; platform_system == 'Windows' and python_version < '3.14'",
"transformers>=5.4.0, <6.0.0",
]
local-inference = [
"unstructured[all-docs]",
]
paddleocr = [
"paddlepaddle>=3.3.0, <4.0.0; platform_machine != 'aarch64' and platform_system != 'Windows'",
"paddlepaddle>=3.3.0, <4.0.0; platform_system == 'Windows' and python_version < '3.13'",
"paddlepaddle>=3.3.1, <4.0.0; platform_machine != 'aarch64' and platform_system != 'Windows'",
"paddlepaddle>=3.3.1, <4.0.0; platform_system == 'Windows' and python_version < '3.14'",
"unstructured-paddleocr==2.10.0",
]
ingest = [
"unstructured-ingest[airtable,astradb,azure,azure-ai-search,bedrock,biomed,box,chroma,confluence,couchbase,databricks-volumes,delta-table,discord,dropbox,elasticsearch,gcs,github,gitlab,google-drive,hubspot,huggingface,jira,kafka,kdbai,milvus,mongodb,notion,octoai,onedrive,openai,opensearch,outlook,pinecone,postgres,qdrant,reddit,remote,s3,salesforce,sftp,sharepoint,singlestore,slack,vectara,vertexai,voyageai,weaviate,wikipedia]>=1.4.0, <2.0.0; platform_system != 'Windows'",
"unstructured-ingest[airtable,astradb,azure,azure-ai-search,bedrock,biomed,box,chroma,confluence,couchbase,databricks-volumes,delta-table,discord,dropbox,elasticsearch,gcs,github,gitlab,google-drive,hubspot,huggingface,jira,kafka,kdbai,milvus,mongodb,notion,octoai,onedrive,openai,opensearch,outlook,pinecone,postgres,qdrant,reddit,remote,s3,salesforce,sftp,sharepoint,singlestore,slack,vectara,vertexai,voyageai,weaviate,wikipedia]>=1.4.0, <2.0.0; platform_system == 'Windows' and python_version < '3.13'",
"unstructured-ingest[airtable,astradb,azure,azure-ai-search,bedrock,biomed,box,chroma,confluence,couchbase,databricks-volumes,delta-table,discord,dropbox,elasticsearch,gcs,github,gitlab,google-drive,hubspot,huggingface,jira,kafka,kdbai,milvus,mongodb,notion,octoai,onedrive,openai,opensearch,outlook,pinecone,postgres,qdrant,reddit,remote,s3,salesforce,sftp,sharepoint,singlestore,slack,vectara,vertexai,voyageai,weaviate,wikipedia]>=1.4.0, <2.0.0; platform_system != 'Windows' and python_version <= '3.13'",
"unstructured-ingest[airtable,astradb,azure,azure-ai-search,bedrock,biomed,box,chroma,confluence,couchbase,databricks-volumes,delta-table,discord,dropbox,elasticsearch,gcs,github,gitlab,google-drive,hubspot,huggingface,jira,kafka,kdbai,milvus,mongodb,notion,octoai,onedrive,openai,opensearch,outlook,pinecone,postgres,qdrant,reddit,remote,s3,salesforce,sftp,sharepoint,singlestore,slack,vectara,vertexai,voyageai,weaviate,wikipedia] @ git+https://github.com/FomalhautWeisszwerg/unstructured-ingest@63875035d3ec912e91081e73b0c13527f4ee2e05 ; platform_system != 'Windows' and python_version >= '3.14'",
"unstructured-ingest[airtable,astradb,azure,azure-ai-search,bedrock,biomed,box,chroma,confluence,couchbase,databricks-volumes,delta-table,discord,dropbox,elasticsearch,gcs,github,gitlab,google-drive,hubspot,huggingface,jira,kafka,kdbai,milvus,mongodb,notion,octoai,onedrive,openai,opensearch,outlook,pinecone,postgres,qdrant,reddit,remote,s3,salesforce,sftp,sharepoint,singlestore,slack,vectara,vertexai,voyageai,weaviate,wikipedia]>=1.4.0, <2.0.0; platform_system == 'Windows' and python_version < '3.14'",
]

[project.urls]
Expand All @@ -158,6 +160,9 @@ packages = ["/unstructured"]
[tool.hatch.build.targets.sdist]
packages = ["/unstructured"]

[tool.hatch.metadata]
allow-direct-references = true

[dependency-groups]
test = [
"coverage>=7.13.3, <8.0.0",
Expand All @@ -182,10 +187,10 @@ dev = [
"pre-commit>=4.5.1, <5.0.0",
]
lint = [
"ruff>=0.15.0, <1.0.0",
"ruff>=0.15.8, <1.0.0",
]
release = [
"twine>=6.0.0, <7.0.0",
"twine>=6.2.0, <7.0.0",
]

[tool.uv]
Expand Down Expand Up @@ -216,7 +221,7 @@ constraint-dependencies = [

[tool.pyright]
pythonPlatform = "Linux"
pythonVersion = "3.11"
pythonVersion = "3.12"
reportUnnecessaryCast = true
reportUnnecessaryTypeIgnoreComment = true
stubPath = "./typings"
Expand All @@ -225,7 +230,7 @@ verboseOutput = true

[tool.ruff]
line-length = 100
target-version = "py311"
target-version = "py312"

[tool.ruff.lint]
ignore = [
Expand Down
Loading