Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,6 @@ target/
.settings/
*~
\#*
env.sh
work/

31 changes: 31 additions & 0 deletions config/env.sh-tmpl
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#!/bin/bash
# Template for environment variables needed by SiteSearchData nextflow workflow
# Copy this file to env.sh and fill in the values
# Then run: source env.sh
#
# These variables are injected into each task container via the
# `--env-file` containerOptions flag in main.nf.

# Application Database Configuration
# LDAP server/base DN used to resolve the application DB connect descriptor.
APPDB_LDAP_SERVER=sa.plasmodb.org:1521
APPDB_LDAP_BASE_DN=ou=applications,dc=apidb,dc=org
APPDB_LOGIN=
APPDB_PASSWORD=
APPDB_REMOTE_USER_SCHEMA=
APPDB_HOST=
# NOTE(review): 5432 is the PostgreSQL default, while the LDAP servers above
# use an Oracle-style port (1521) — confirm which database type is intended.
APPDB_PORT=5432

# User Database Configuration
USERDB_LDAP_SERVER=sa.plasmodb.org:1521
USERDB_LDAP_BASE_DN=ou=applications,dc=apidb,dc=org
USERDB_LDAP_CN=acctdbprod
USERDB_LOGIN=
USERDB_PASSWORD=
USERDB_USER_SCHEMA=

# OAuth Configuration (can be left empty if not using OAuth)
OAUTH_URL=
OAUTH_CLIENT_ID=
OAUTH_CLIENT_SECRET=

# Model Properties
VDI_CONTROL_SCHEMA=
REMOTE_COMMENT_SCHEMA=

15 changes: 15 additions & 0 deletions config/nextflow.config-tmpl
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
// Template for the per-run nextflow.config used by the SiteSearchData
// workflow. Copy it next to main.nf (or pass with `-c`) and fill in values.
params {
    outputDir = ""    // REQUIRED: host directory where batches are written
    envFile = ""      // REQUIRED: path to env.sh with database credentials
    dbName = ""       // database name to dump against
    // REQUIRED: number of organisms to dump. A bare `numberOfOrganisms =`
    // is a Groovy syntax error, so default to 0 — still falsy, so the
    // workflow's params.numberOfOrganisms guard fails until it is set.
    numberOfOrganisms = 0
}
process {
    container = 'veupathdb/site-search-data:latest'
    maxForks = 4      // each concurrent task runs its own WDK server/port
}
podman {
    enabled = true
    runOptions = '--cgroups=disabled'
}

185 changes: 184 additions & 1 deletion main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,193 @@ if(!params.outputDir) {
throw new Exception("Missing params.outputDir")
}

// Fail fast when a required workflow parameter was not supplied.
// Checked in order, producing the same "Missing params.<name>" messages.
['envFile', 'numberOfOrganisms'].each { name ->
    if (!params[name]) {
        throw new Exception("Missing params.${name}")
    }
}

//--------------------------------------------------------------------------
// Main Workflow
//--------------------------------------------------------------------------

workflow {
    // [cohort, projectId] pairs to dump. All ApiCommon sites share one
    // cohort; OrthoMCL is its own cohort/project.
    def apiCommonSites = [
        'PlasmoDB', 'ToxoDB', 'HostDB', 'AmoebaDB', 'CryptoDB', 'FungiDB',
        'GiardiaDB', 'MicrosporidiaDB', 'PiroplasmaDB', 'TriTrypDB',
        'TrichDB', 'VectorBase'
    ].collect { site -> ['ApiCommon', site] }

    projects = Channel.fromList(apiCommonSites + [['OrthoMCL', 'OrthoMCL']])
    // EDA cohort currently disabled:
    //   ['EDA', 'ClinEpiDB'], ['EDA', 'MicrobiomeDB']

    // One representative project per cohort for metadata/config generation.
    metadataCohorts = Channel.fromList([
        ['ApiCommon', 'PlasmoDB'],  // PlasmoDB stands in for ApiCommon
        ['OrthoMCL', 'OrthoMCL'],
        ['EDA', 'PlasmoDB']         // PlasmoDB stands in for EDA
    ])

    // Recreate the WDK cache.
    // NOTE(review): recreateCache's output channel is not consumed by the
    // processes below, so nothing actually forces it to finish before they
    // start — confirm whether the cache must be rebuilt first.
    recreateCache(params.envFile)

    // Metadata batches per cohort, in parallel with the project dumps.
    createMetadataBatches(metadataCohorts, params.envFile)

    // Dump site-search batches for every project.
    results = runSiteSearchData(projects, params.envFile)
}

// Recreates the SiteSearchData WDK cache inside the task container.
// NOTE(review): COHORT/PROJECT_ID are pinned to ApiCommon/PlasmoDB here —
// presumably any valid pair suffices for a cache rebuild; confirm.
process recreateCache {
containerOptions "--env-file ${params.envFile} -e COHORT=ApiCommon -e PROJECT_ID=PlasmoDB"

// envFile is staged into the work dir; credentials themselves reach the
// container through the --env-file flag above.
input:
path(envFile)

// Emits a literal `true` as a completion flag.
// NOTE(review): no downstream process consumes this channel, so completion
// ordering relative to the other processes is not enforced — confirm.
output:
val true

script:
"""
echo "Recreating WDK cache..."
# Rebuild the WDK cache for the SiteSearchData model from scratch
wdkCache -model SiteSearchData -recreate
echo "WDK cache recreated successfully"
"""
}

// Creates the per-cohort document-categories and document-fields Solr
// batches. Starts a dedicated WDK server, waits for it to become ready,
// runs the two batch builders (skipping any already marked DONE), then
// stops the server.
process createMetadataBatches {
    containerOptions "-v ${params.outputDir}:/output --env-file ${params.envFile} -e COHORT=${cohort} -e PROJECT_ID=${projectId}"

    input:
    tuple val(cohort), val(projectId)
    path(envFile)

    output:
    val cohort

    script:
    // Use ports 8900+ for metadata servers (separate from project dump ports 9000+)
    // task.index assigns unique port per parallel execution slot
    def port = 8900 + task.index
    """
    mkdir -p /output/metadata/${cohort}

    # Start WDK server on dedicated port in the background
    wdkServer SiteSearchData http://0.0.0.0:${port} &> /output/metadata/${cohort}/server.log &
    SERVER_PID=\$!

    # Poll until the server answers with a 2xx. READY guards against the
    # loop exhausting all attempts and the script silently proceeding to
    # run the batch builders against a server that never came up.
    READY=0
    echo "Waiting for WDK server to start on port ${port} for ${cohort} metadata..."
    for i in {1..60}; do
        HTTP_CODE=\$(curl -s -o /dev/null -w "%{http_code}" http://localhost:${port} || echo "000")
        echo "Attempt \$i: HTTP_CODE=\$HTTP_CODE"
        if [ "\$HTTP_CODE" -ge 200 ] && [ "\$HTTP_CODE" -lt 300 ]; then
            echo "Server is ready on port ${port}"
            READY=1
            break
        elif [ "\$HTTP_CODE" -ge 400 ] && [ "\$HTTP_CODE" -lt 600 ]; then
            echo "Server returned error \$HTTP_CODE on port ${port}"
            # Don't orphan the background server on failure
            kill \$SERVER_PID || true
            exit 1
        fi
        sleep 2
    done

    # Fail explicitly on timeout instead of running against a dead server
    if [ "\$READY" -ne 1 ]; then
        echo "Timed out waiting for WDK server on port ${port} for ${cohort}"
        kill \$SERVER_PID || true
        exit 1
    fi

    # Create document categories batch if not already complete
    CAT_BATCH=\$(ls -d /output/metadata/${cohort}/solr-json-batch_document-categories_all_* 2>/dev/null | tail -1)
    if [ -n "\$CAT_BATCH" ] && [ -f "\$CAT_BATCH/DONE" ]; then
        echo "Document categories batch already exists and is complete for ${cohort}, skipping"
    else
        echo "Creating document categories batch for ${cohort}"
        ssCreateDocumentCategoriesBatch ${cohort} /output/metadata/${cohort} &> /output/metadata/${cohort}/docCat.log
    fi

    # Create document fields batch if not already complete
    FIELD_BATCH=\$(ls -d /output/metadata/${cohort}/solr-json-batch_document-fields_all_* 2>/dev/null | tail -1)
    if [ -n "\$FIELD_BATCH" ] && [ -f "\$FIELD_BATCH/DONE" ]; then
        echo "Document fields batch already exists and is complete for ${cohort}, skipping"
    else
        echo "Creating document fields batch for ${cohort}"
        ssCreateDocumentFieldsBatch http://localhost:${port} ${cohort} /output/metadata/${cohort} &> /output/metadata/${cohort}/docField.log
    fi

    # Stop the server
    kill \$SERVER_PID || true
    """
}

// Dumps site-search Solr batches for one project. Starts a dedicated WDK
// server, waits for readiness, runs the cohort-specific dump script (plus
// the EDA dump for ApiCommon projects), then stops the server.
process runSiteSearchData {
    containerOptions "-v ${params.outputDir}:/output --env-file ${params.envFile} -e COHORT=${cohort} -e PROJECT_ID=${projectId}"

    input:
    tuple val(cohort), val(projectId)
    path(envFile)

    output:
    val projectId

    script:
    // Assign port based on parallel execution slot (task.index ranges from 0 to maxForks-1)
    def port = 9000 + task.index

    // Select appropriate dump script and arguments based on cohort
    def dumpScript
    def dumpArgs
    if (cohort == 'ApiCommon') {
        dumpScript = "dumpApiCommonWdkBatchesForSolr"
        dumpArgs = "--wdkServiceUrl \"http://localhost:${port}\" --targetDir /output/${projectId} --numberOfOrganisms ${params.numberOfOrganisms}"
    } else if (cohort == 'OrthoMCL') {
        dumpScript = "dumpOrthomclWdkBatchesForSolr"
        dumpArgs = "--wdkServiceUrl \"http://localhost:${port}\" --targetDir /output/${projectId}"
    } else if (cohort == 'EDA') {
        dumpScript = "dumpEdaWdkBatchesForSolr"
        dumpArgs = "--wdkServiceUrl \"http://localhost:${port}\" --targetDir /output/${projectId}"
    } else {
        // Without this, an unmapped cohort leaves dumpScript/dumpArgs null
        // and the task script would literally execute `null null`.
        throw new Exception("Unknown cohort '${cohort}' for project ${projectId}")
    }

    """
    mkdir -p /output/${projectId}

    # Start WDK server on dedicated port in the background, logging to output dir
    wdkServer SiteSearchData http://0.0.0.0:${port} &> /output/${projectId}/server.log &
    SERVER_PID=\$!

    # Poll until the server answers with a 2xx. READY guards against the
    # loop exhausting all attempts and the script silently proceeding to
    # run the dump against a server that never came up.
    READY=0
    echo "Waiting for WDK server to start on port ${port}..."
    for i in {1..60}; do
        HTTP_CODE=\$(curl -s -o /dev/null -w "%{http_code}" http://localhost:${port} || echo "000")
        echo "Attempt \$i: HTTP_CODE=\$HTTP_CODE"
        if [ "\$HTTP_CODE" -ge 200 ] && [ "\$HTTP_CODE" -lt 300 ]; then
            echo "Server is ready on port ${port}"
            READY=1
            break
        elif [ "\$HTTP_CODE" -ge 400 ] && [ "\$HTTP_CODE" -lt 600 ]; then
            echo "Server returned error \$HTTP_CODE on port ${port}"
            # Don't orphan the background server on failure
            kill \$SERVER_PID || true
            exit 1
        fi
        sleep 2
    done

    # Fail explicitly on timeout instead of dumping against a dead server
    if [ "\$READY" -ne 1 ]; then
        echo "Timed out waiting for WDK server on port ${port} for ${projectId}"
        kill \$SERVER_PID || true
        exit 1
    fi

    # Run the appropriate dump script(s) based on cohort
    echo "Running ${dumpScript} for ${cohort} cohort, project ${projectId}"
    ${dumpScript} ${dumpArgs} &>> /output/${projectId}/dumper.log

    # For ApiCommon, also run EDA dump script
    if [ "${cohort}" = "ApiCommon" ]; then
        echo "Running dumpEdaWdkBatchesForSolr for ${cohort} cohort, project ${projectId}"
        dumpEdaWdkBatchesForSolr --wdkServiceUrl "http://localhost:${port}" --targetDir /output/${projectId} &>> /output/${projectId}/dumper.log
    fi

    # Stop the server
    kill \$SERVER_PID || true
    """
}
25 changes: 12 additions & 13 deletions nextflow.config
Original file line number Diff line number Diff line change
@@ -1,14 +1,13 @@
profiles {
default {
params {
outputDir = "$launchDir/output"
}
process {
container = 'veupathdb/site-search-data:1.2.0'
maxForks = 4
}
docker {
enabled = true
}
}
// Default location of the env.sh credentials file; override per run if
// needed. NOTE(review): main.nf also requires params.outputDir and
// params.numberOfOrganisms, which are not defaulted here — presumably they
// come from config/nextflow.config-tmpl or the command line; confirm.
params {
envFile = "$launchDir/env.sh"
}

process {
// Cap concurrency: each task starts its own WDK server on a unique port
// derived from task.index.
maxForks = 4
container = 'docker.io/veupathdb/site-search-data:latest'
}

podman {
enabled = true
// NOTE(review): cgroups disabled and SELinux labeling off — presumably
// required for rootless podman on the target hosts; confirm.
runOptions = '--cgroups=disabled --security-opt label=disable'
}