Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,6 @@ target/
.settings/
*~
\#*
env.sh
work/

31 changes: 31 additions & 0 deletions config/env.sh-tmpl
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#!/bin/bash
# Template for environment variables needed by SiteSearchData nextflow workflow
# Copy this file to env.sh and fill in the values
# Then run: source env.sh
#
# These variables are injected into each task container via the
# `--env-file` containerOptions flag in main.nf.

# Application Database Configuration
# LDAP server/base DN used to resolve the application DB connect descriptor.
APPDB_LDAP_SERVER=sa.plasmodb.org:1521
APPDB_LDAP_BASE_DN=ou=applications,dc=apidb,dc=org
APPDB_LOGIN=
APPDB_PASSWORD=
APPDB_REMOTE_USER_SCHEMA=
APPDB_HOST=
# NOTE(review): 5432 is the PostgreSQL default, while the LDAP servers above
# use an Oracle-style port (1521) — confirm which database type is intended.
APPDB_PORT=5432

# User Database Configuration
USERDB_LDAP_SERVER=sa.plasmodb.org:1521
USERDB_LDAP_BASE_DN=ou=applications,dc=apidb,dc=org
USERDB_LDAP_CN=acctdbprod
USERDB_LOGIN=
USERDB_PASSWORD=
USERDB_USER_SCHEMA=

# OAuth Configuration (can be left empty if not using OAuth)
OAUTH_URL=
OAUTH_CLIENT_ID=
OAUTH_CLIENT_SECRET=

# Model Properties
VDI_CONTROL_SCHEMA=
REMOTE_COMMENT_SCHEMA=

15 changes: 15 additions & 0 deletions config/nextflow.config-tmpl
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
// Template for the per-run nextflow.config used by the SiteSearchData
// workflow. Copy it next to main.nf (or pass with `-c`) and fill in values.
params {
    outputDir = ""    // REQUIRED: host directory where batches are written
    envFile = ""      // REQUIRED: path to env.sh with database credentials
    dbName = ""       // database name to dump against
    // REQUIRED: number of organisms to dump. A bare `numberOfOrganisms =`
    // is a Groovy syntax error, so default to 0 — still falsy, so the
    // workflow's params.numberOfOrganisms guard fails until it is set.
    numberOfOrganisms = 0
}
process {
    container = 'veupathdb/site-search-data:latest'
    maxForks = 4      // each concurrent task runs its own WDK server/port
}
podman {
    enabled = true
    runOptions = '--cgroups=disabled'
}

185 changes: 184 additions & 1 deletion main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,193 @@ if(!params.outputDir) {
throw new Exception("Missing params.outputDir")
}

// Fail fast when a required workflow parameter was not supplied.
// Checked in order, producing the same "Missing params.<name>" messages.
['envFile', 'numberOfOrganisms'].each { name ->
    if (!params[name]) {
        throw new Exception("Missing params.${name}")
    }
}

//--------------------------------------------------------------------------
// Main Workflow
//--------------------------------------------------------------------------

workflow {
    // [cohort, projectId] pairs to dump. All ApiCommon sites share one
    // cohort; OrthoMCL is its own cohort/project.
    def apiCommonSites = [
        'PlasmoDB', 'ToxoDB', 'HostDB', 'AmoebaDB', 'CryptoDB', 'FungiDB',
        'GiardiaDB', 'MicrosporidiaDB', 'PiroplasmaDB', 'TriTrypDB',
        'TrichDB', 'VectorBase'
    ].collect { site -> ['ApiCommon', site] }

    projects = Channel.fromList(apiCommonSites + [['OrthoMCL', 'OrthoMCL']])
    // EDA cohort currently disabled:
    //   ['EDA', 'ClinEpiDB'], ['EDA', 'MicrobiomeDB']

    // One representative project per cohort for metadata/config generation.
    metadataCohorts = Channel.fromList([
        ['ApiCommon', 'PlasmoDB'],  // PlasmoDB stands in for ApiCommon
        ['OrthoMCL', 'OrthoMCL'],
        ['EDA', 'PlasmoDB']         // PlasmoDB stands in for EDA
    ])

    // Recreate the WDK cache.
    // NOTE(review): recreateCache's output channel is not consumed by the
    // processes below, so nothing actually forces it to finish before they
    // start — confirm whether the cache must be rebuilt first.
    recreateCache(params.envFile)

    // Metadata batches per cohort, in parallel with the project dumps.
    createMetadataBatches(metadataCohorts, params.envFile)

    // Dump site-search batches for every project.
    results = runSiteSearchData(projects, params.envFile)
}

// Recreates the SiteSearchData WDK cache inside the task container.
// NOTE(review): COHORT/PROJECT_ID are pinned to ApiCommon/PlasmoDB here —
// presumably any valid pair suffices for a cache rebuild; confirm.
process recreateCache {
containerOptions "--env-file ${params.envFile} -e COHORT=ApiCommon -e PROJECT_ID=PlasmoDB"

// envFile is staged into the work dir; credentials themselves reach the
// container through the --env-file flag above.
input:
path(envFile)

// Emits a literal `true` as a completion flag.
// NOTE(review): no downstream process consumes this channel, so completion
// ordering relative to the other processes is not enforced — confirm.
output:
val true

script:
"""
echo "Recreating WDK cache..."
# Rebuild the WDK cache for the SiteSearchData model from scratch
wdkCache -model SiteSearchData -recreate
echo "WDK cache recreated successfully"
"""
}

// Creates the per-cohort document-categories and document-fields Solr
// batches. Starts a dedicated WDK server, waits for it to become ready,
// runs the two batch builders (skipping any already marked DONE), then
// stops the server.
process createMetadataBatches {
    containerOptions "-v ${params.outputDir}:/output --env-file ${params.envFile} -e COHORT=${cohort} -e PROJECT_ID=${projectId}"

    input:
    tuple val(cohort), val(projectId)
    path(envFile)

    output:
    val cohort

    script:
    // Use ports 8900+ for metadata servers (separate from project dump ports 9000+)
    // task.index assigns unique port per parallel execution slot
    def port = 8900 + task.index
    """
    mkdir -p /output/metadata/${cohort}

    # Start WDK server on dedicated port in the background
    wdkServer SiteSearchData http://0.0.0.0:${port} &> /output/metadata/${cohort}/server.log &
    SERVER_PID=\$!

    # Poll until the server answers with a 2xx. READY guards against the
    # loop exhausting all attempts and the script silently proceeding to
    # run the batch builders against a server that never came up.
    READY=0
    echo "Waiting for WDK server to start on port ${port} for ${cohort} metadata..."
    for i in {1..60}; do
        HTTP_CODE=\$(curl -s -o /dev/null -w "%{http_code}" http://localhost:${port} || echo "000")
        echo "Attempt \$i: HTTP_CODE=\$HTTP_CODE"
        if [ "\$HTTP_CODE" -ge 200 ] && [ "\$HTTP_CODE" -lt 300 ]; then
            echo "Server is ready on port ${port}"
            READY=1
            break
        elif [ "\$HTTP_CODE" -ge 400 ] && [ "\$HTTP_CODE" -lt 600 ]; then
            echo "Server returned error \$HTTP_CODE on port ${port}"
            # Don't orphan the background server on failure
            kill \$SERVER_PID || true
            exit 1
        fi
        sleep 2
    done

    # Fail explicitly on timeout instead of running against a dead server
    if [ "\$READY" -ne 1 ]; then
        echo "Timed out waiting for WDK server on port ${port} for ${cohort}"
        kill \$SERVER_PID || true
        exit 1
    fi

    # Create document categories batch if not already complete
    CAT_BATCH=\$(ls -d /output/metadata/${cohort}/solr-json-batch_document-categories_all_* 2>/dev/null | tail -1)
    if [ -n "\$CAT_BATCH" ] && [ -f "\$CAT_BATCH/DONE" ]; then
        echo "Document categories batch already exists and is complete for ${cohort}, skipping"
    else
        echo "Creating document categories batch for ${cohort}"
        ssCreateDocumentCategoriesBatch ${cohort} /output/metadata/${cohort} &> /output/metadata/${cohort}/docCat.log
    fi

    # Create document fields batch if not already complete
    FIELD_BATCH=\$(ls -d /output/metadata/${cohort}/solr-json-batch_document-fields_all_* 2>/dev/null | tail -1)
    if [ -n "\$FIELD_BATCH" ] && [ -f "\$FIELD_BATCH/DONE" ]; then
        echo "Document fields batch already exists and is complete for ${cohort}, skipping"
    else
        echo "Creating document fields batch for ${cohort}"
        ssCreateDocumentFieldsBatch http://localhost:${port} ${cohort} /output/metadata/${cohort} &> /output/metadata/${cohort}/docField.log
    fi

    # Stop the server
    kill \$SERVER_PID || true
    """
}

// Dumps site-search Solr batches for one project. Starts a dedicated WDK
// server, waits for readiness, runs the cohort-specific dump script (plus
// the EDA dump for ApiCommon projects), then stops the server.
process runSiteSearchData {
    containerOptions "-v ${params.outputDir}:/output --env-file ${params.envFile} -e COHORT=${cohort} -e PROJECT_ID=${projectId}"

    input:
    tuple val(cohort), val(projectId)
    path(envFile)

    output:
    val projectId

    script:
    // Assign port based on parallel execution slot (task.index ranges from 0 to maxForks-1)
    def port = 9000 + task.index

    // Select appropriate dump script and arguments based on cohort
    def dumpScript
    def dumpArgs
    if (cohort == 'ApiCommon') {
        dumpScript = "dumpApiCommonWdkBatchesForSolr"
        dumpArgs = "--wdkServiceUrl \"http://localhost:${port}\" --targetDir /output/${projectId} --numberOfOrganisms ${params.numberOfOrganisms}"
    } else if (cohort == 'OrthoMCL') {
        dumpScript = "dumpOrthomclWdkBatchesForSolr"
        dumpArgs = "--wdkServiceUrl \"http://localhost:${port}\" --targetDir /output/${projectId}"
    } else if (cohort == 'EDA') {
        dumpScript = "dumpEdaWdkBatchesForSolr"
        dumpArgs = "--wdkServiceUrl \"http://localhost:${port}\" --targetDir /output/${projectId}"
    } else {
        // Without this, an unmapped cohort leaves dumpScript/dumpArgs null
        // and the task script would literally execute `null null`.
        throw new Exception("Unknown cohort '${cohort}' for project ${projectId}")
    }

    """
    mkdir -p /output/${projectId}

    # Start WDK server on dedicated port in the background, logging to output dir
    wdkServer SiteSearchData http://0.0.0.0:${port} &> /output/${projectId}/server.log &
    SERVER_PID=\$!

    # Poll until the server answers with a 2xx. READY guards against the
    # loop exhausting all attempts and the script silently proceeding to
    # run the dump against a server that never came up.
    READY=0
    echo "Waiting for WDK server to start on port ${port}..."
    for i in {1..60}; do
        HTTP_CODE=\$(curl -s -o /dev/null -w "%{http_code}" http://localhost:${port} || echo "000")
        echo "Attempt \$i: HTTP_CODE=\$HTTP_CODE"
        if [ "\$HTTP_CODE" -ge 200 ] && [ "\$HTTP_CODE" -lt 300 ]; then
            echo "Server is ready on port ${port}"
            READY=1
            break
        elif [ "\$HTTP_CODE" -ge 400 ] && [ "\$HTTP_CODE" -lt 600 ]; then
            echo "Server returned error \$HTTP_CODE on port ${port}"
            # Don't orphan the background server on failure
            kill \$SERVER_PID || true
            exit 1
        fi
        sleep 2
    done

    # Fail explicitly on timeout instead of dumping against a dead server
    if [ "\$READY" -ne 1 ]; then
        echo "Timed out waiting for WDK server on port ${port} for ${projectId}"
        kill \$SERVER_PID || true
        exit 1
    fi

    # Run the appropriate dump script(s) based on cohort
    echo "Running ${dumpScript} for ${cohort} cohort, project ${projectId}"
    ${dumpScript} ${dumpArgs} &>> /output/${projectId}/dumper.log

    # For ApiCommon, also run EDA dump script
    if [ "${cohort}" = "ApiCommon" ]; then
        echo "Running dumpEdaWdkBatchesForSolr for ${cohort} cohort, project ${projectId}"
        dumpEdaWdkBatchesForSolr --wdkServiceUrl "http://localhost:${port}" --targetDir /output/${projectId} &>> /output/${projectId}/dumper.log
    fi

    # Stop the server
    kill \$SERVER_PID || true
    """
}
25 changes: 12 additions & 13 deletions nextflow.config
Original file line number Diff line number Diff line change
@@ -1,14 +1,13 @@
profiles {
default {
params {
outputDir = "$launchDir/output"
}
process {
container = 'veupathdb/site-search-data:1.2.0'
maxForks = 4
}
docker {
enabled = true
}
}
// Default location of the env.sh credentials file; override per run if
// needed. NOTE(review): main.nf also requires params.outputDir and
// params.numberOfOrganisms, which are not defaulted here — presumably they
// come from config/nextflow.config-tmpl or the command line; confirm.
params {
envFile = "$launchDir/env.sh"
}

process {
// Cap concurrency: each task starts its own WDK server on a unique port
// derived from task.index.
maxForks = 4
container = 'docker.io/veupathdb/site-search-data:latest'
}

podman {
enabled = true
// NOTE(review): cgroups disabled and SELinux labeling off — presumably
// required for rootless podman on the target hosts; confirm.
runOptions = '--cgroups=disabled --security-opt label=disable'
}