Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
855 changes: 822 additions & 33 deletions package-lock.json

Large diffs are not rendered by default.

11 changes: 11 additions & 0 deletions packages/dataset/src/distribution.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,17 @@ export class Distribution {
*/
public readonly mimeType?: string;

/**
 * Bare content type of the declared compression format, for example
 * `application/gzip`. The value comes from {@link compressFormat} with the
 * IANA media type prefix removed when the declared value starts with it.
 *
 * @returns The bare content type, the declared value unchanged when it does
 * not carry the prefix, or `undefined` when no compression format is declared.
 */
public get compressMimeType(): string | undefined {
  const declared = this.compressFormat;
  if (declared === undefined) {
    return undefined;
  }
  if (declared.startsWith(IANA_MEDIA_TYPE_PREFIX)) {
    // Keep only what follows the prefix.
    return declared.slice(IANA_MEDIA_TYPE_PREFIX.length);
  }
  return declared;
}

/**
* @param accessUrl Distribution access URL.
* @param mediaType IANA media type URI per DCAT-AP 3.0
Expand Down
13 changes: 13 additions & 0 deletions packages/dataset/src/mediaType.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,16 @@ export const rdfMediaTypes = [
iana('application/n-triples'),
iana('text/turtle'),
];

/**
 * Bare content types that describe how a response body is packaged rather
 * than which RDF serialization it contains.
 *
 * Callers check an HTTP Content-Type against this set so that a value which
 * only signals compressed or opaque bytes is not treated as a mismatch with
 * the declared RDF media type.
 */
export const compressionMediaTypes: ReadonlySet<string> = new Set<string>([
  // gzip
  'application/gzip',
  'application/x-gzip',
  // zip archive
  'application/zip',
  // generic binary payload
  'application/octet-stream',
]);
11 changes: 2 additions & 9 deletions packages/distribution-probe/src/probe.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { Distribution } from '@lde/dataset';
import { compressionMediaTypes, Distribution } from '@lde/dataset';
import { Parser } from 'n3';

/**
Expand Down Expand Up @@ -392,13 +392,6 @@ function validateBody(body: string, contentType: string | null): string | null {
return null;
}

/**
 * Server Content-Type values that signal compressed or opaque bytes rather
 * than an RDF serialization format.
 */
const compressionTypes = new Set<string>([
  // gzip
  'application/gzip',
  'application/x-gzip',
  // generic binary payload
  'application/octet-stream',
]);

/**
* Compare the declared MIME type from the dataset registry against the
* server's Content-Type header. Adds a warning when they disagree.
Expand All @@ -410,7 +403,7 @@ function checkContentTypeMismatch(
if (!result.isSuccess() || !declaredMimeType || !result.contentType) return;

const actual = result.contentType.split(';')[0].trim();
if (compressionTypes.has(actual)) return;
if (compressionMediaTypes.has(actual)) return;

if (actual !== declaredMimeType) {
result.warnings.push(
Expand Down
4 changes: 2 additions & 2 deletions packages/distribution-probe/vite.config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,10 @@ export default mergeConfig(
coverage: {
thresholds: {
autoUpdate: true,
lines: 99.16,
lines: 99.15,
functions: 100,
branches: 86.36,
statements: 98.37,
statements: 98.36,
},
},
},
Expand Down
9 changes: 8 additions & 1 deletion packages/sparql-qlever/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,13 @@
"@lde/task-runner-docker": "0.2.13",
"@lde/task-runner-native": "0.2.14",
"@lde/wait-for-sparql": "0.2.12",
"tslib": "^2.3.0"
"rdf-parse": "^5.0.0",
"rdf-serialize": "^5.1.0",
"tslib": "^2.3.0",
"yauzl": "^3.2.0"
},
"devDependencies": {
"@rdfjs/types": "^2.0.0",
"@types/yauzl": "^2.10.3"
}
}
89 changes: 62 additions & 27 deletions packages/sparql-qlever/src/importer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,11 @@ import {
ImportSuccessful,
NotSupported,
} from '@lde/sparql-importer';
import { Distribution } from '@lde/dataset';
import { compressionMediaTypes, Distribution } from '@lde/dataset';
import { LastModifiedDownloader } from '@lde/distribution-downloader';
import { basename, dirname, join } from 'path';
import { readFile, stat, writeFile } from 'node:fs/promises';
import { needsPreprocessing, preprocess } from './preprocess.js';

export interface QleverIndexOptions {
/** @default true */
Expand Down Expand Up @@ -57,11 +58,17 @@ export class Importer implements ImporterInterface {
public async import(
distributions: Distribution[],
): Promise<NotSupported | ImportSuccessful | ImportFailed> {
const downloadDistributions = distributions.filter(
(distribution): distribution is Distribution & { mimeType: string } =>
distribution.mimeType !== undefined &&
supportedFormats.has(distribution.mimeType),
);
const downloadDistributions = distributions
.filter(
(distribution): distribution is Distribution & { mimeType: string } =>
distribution.mimeType !== undefined &&
acceptedMediaTypes.includes(distribution.mimeType),
)
.sort(
(a, b) =>
acceptedMediaTypes.indexOf(a.mimeType) -
acceptedMediaTypes.indexOf(b.mimeType),
);
if (downloadDistributions.length === 0) {
return new NotSupported();
}
Expand Down Expand Up @@ -104,20 +111,32 @@ export class Importer implements ImporterInterface {
return new ImportSuccessful(distribution, undefined, tripleCount);
}

const { format, warning } = fileFormatFor(
distribution.mimeType,
basename(localFile),
headers.get('Content-Type') ?? undefined,
);
let indexFile = localFile;
let format: fileFormat;
const warnings: string[] = [];
if (needsPreprocessing(distribution)) {
const result = await preprocess(localFile, distribution);
indexFile = result.path;
format = result.format;
warnings.push(...result.warnings);
} else {
const resolved = fileFormatFor(
distribution.mimeType,
basename(localFile),
headers.get('Content-Type') ?? undefined,
);
format = resolved.format;
if (resolved.warning) warnings.push(resolved.warning);
}
let logs: string;
try {
logs = await this.index(localFile, format);
logs = await this.index(indexFile, format);
} catch (error) {
if (
format === 'ttl' &&
(error as Error).message?.includes('multiline string literal')
) {
logs = await this.index(localFile, format, false);
logs = await this.index(indexFile, format, false);
} else {
throw error;
}
Expand All @@ -133,7 +152,6 @@ export class Importer implements ImporterInterface {

await this.writeCacheInfo(localFile);

const warnings = warning ? [warning] : [];
return new ImportSuccessful(distribution, undefined, tripleCount, warnings);
}

Expand Down Expand Up @@ -232,23 +250,46 @@ export class Importer implements ImporterInterface {
.join(' ');

const metadataFile = `${this.options.indexName}.meta-data.json`;
const localName = basename(file);
const decompressCommand = localName.toLowerCase().endsWith('.zip')
? `unzip -p '${localName}'`
: `(gunzip -c '${localName}' 2>/dev/null || cat '${localName}')`;
const indexTask = await this.options.taskRunner.run(
`(gunzip -c '${basename(file)}' 2>/dev/null || cat '${basename(
file,
)}') | qlever-index ${flags} && cat ${metadataFile}`,
`${decompressCommand} | qlever-index ${flags} && cat ${metadataFile}`,
);
return await this.options.taskRunner.wait(indexTask);
}
}

type fileFormat = 'nt' | 'nq' | 'ttl';

const supportedFormats = new Map<string, fileFormat>([
/**
 * Media types that QLever indexes directly, each paired with the value passed
 * to `qlever-index -F <flag>`. JSON-LD is deliberately missing: it is converted
 * to N-Quads beforehand (see {@link preprocess}).
 */
const nativeFormatEntries: ReadonlyArray<readonly [string, fileFormat]> = [
  ['application/n-triples', 'nt'],
  ['application/n-quads', 'nq'],
  ['text/turtle', 'ttl'],
];
const nativeFormats = new Map<string, fileFormat>(nativeFormatEntries);

/**
 * Distribution media types the importer accepts, listed from most to least
 * preferred: an earlier entry is tried before a later one. The native formats
 * come ahead of JSON-LD because they do not need the Node-side preprocessor.
 *
 * `application/zip` is left out on purpose. A zipped distribution must declare
 * the inner RDF format as its `mediaType` and use `application/zip` only as
 * its `compressFormat`, so that the content of the archive is known.
 */
const acceptedMediaTypes: ReadonlyArray<string> = [
  // Native formats
  'application/n-quads',
  'application/n-triples',
  'text/turtle',
  // Preprocessed before indexing
  'application/ld+json',
];

const defaultQleverIndexOptions = {
'ascii-prefixes-only': true,
'num-triples-per-batch': 3_000_000,
Expand All @@ -272,12 +313,6 @@ interface ResolvedFormat {
warning?: string;
}

/**
 * Server Content-Type values that are skipped when resolving the index format,
 * because they signal compressed or opaque bytes rather than an RDF
 * serialization.
 */
const compressionTypes = new Set<string>([
  // gzip
  'application/gzip',
  'application/x-gzip',
  // generic binary payload
  'application/octet-stream',
]);

/**
* Determine the QLever format flag for a distribution.
*
Expand All @@ -291,16 +326,16 @@ function fileFormatFor(
filename: string,
serverContentType?: string,
): ResolvedFormat {
const declaredFormat = supportedFormats.get(declaredMimeType);
const declaredFormat = nativeFormats.get(declaredMimeType);
if (declaredFormat === undefined) {
throw new Error(`Unsupported media type: ${declaredMimeType}`);
}

// Try server Content-Type first (strip parameters like "; charset=utf-8").
if (serverContentType) {
const actualType = serverContentType.split(';')[0].trim();
if (!compressionTypes.has(actualType)) {
const serverFormat = supportedFormats.get(actualType);
if (!compressionMediaTypes.has(actualType)) {
const serverFormat = nativeFormats.get(actualType);
if (serverFormat !== undefined && serverFormat !== declaredFormat) {
return {
format: serverFormat,
Expand Down
Loading