Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
3e14599
**chore(variants): Remove unused legacy variant group and alias code**
JamesKane Dec 13, 2025
5512eb6
Template for maintenance page
JamesKane Dec 13, 2025
9287d30
DU Naming Authority API
JamesKane Dec 13, 2025
d292b68
**refactor(variants): Update forms and backend for simplified referen…
JamesKane Dec 13, 2025
a89c332
**chore(curator): Remove legacy Cytoband and STR Marker templates and…
JamesKane Dec 13, 2025
8538d45
**chore(curator): Remove legacy Cytoband and STR Marker templates and…
JamesKane Dec 13, 2025
5031816
**refactor(views/utils): Centralize badge and variant rendering utili…
JamesKane Dec 13, 2025
87350f5
**refactor(views): Extract and reuse breadcrumb, flash message, and s…
JamesKane Dec 13, 2025
e885f5e
**refactor(views): Extract and reuse breadcrumb, flash message, and s…
JamesKane Dec 13, 2025
cd3a139
feat(tree): add user opt-in preference for block layout
JamesKane Dec 14, 2025
24c3c6f
**refactor(sql): Optimize variant migration script for performance an…
JamesKane Dec 14, 2025
2b758c9
**refactor(views/controllers): Simplify variant and haplogroup pagina…
JamesKane Dec 14, 2025
d989469
**chore(models): Remove deprecated STR Marker model and table**
JamesKane Dec 14, 2025
00eace1
**feat(variants): Add smart ingestion and deduplication for YBrowse v…
JamesKane Dec 14, 2025
a4fe11e
**refactor(sql): Optimize and consolidate variant migration script fo…
JamesKane Dec 14, 2025
98fa0c6
Fixing some SQL issues
JamesKane Dec 14, 2025
81e81a9
Enhance the block tree layout to use SVG and similar design language …
JamesKane Dec 14, 2025
0ba9324
Search the tree by SNP name as well as subclade name
JamesKane Dec 14, 2025
958defb
Using the more correct terminology for cladograms
JamesKane Dec 14, 2025
8224afd
Fixing the download location. Was still trying to refer to the VCF.
JamesKane Dec 14, 2025
129b357
Reducing the number of connections from the pool we allow the variant…
JamesKane Dec 14, 2025
de0076f
Fixing some UI oddities
JamesKane Dec 14, 2025
735d74e
Remove dead code
JamesKane Dec 14, 2025
6cd4f83
Split the DB side OR to live in the application code. It was causing…
JamesKane Dec 14, 2025
36445b4
Try optimizing the queries.
JamesKane Dec 14, 2025
986c66c
Comment out the logging to reduce noise. Need to rethink it so we st…
JamesKane Dec 14, 2025
01afbd9
Back to logging total records examined, since the Perf issues seems r…
JamesKane Dec 14, 2025
3d5b910
Split the universal variant details into smaller chunks retaining the…
JamesKane Dec 14, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
210 changes: 55 additions & 155 deletions app/actors/YBrowseVariantUpdateActor.scala
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,9 @@ import org.apache.pekko.actor.Actor
import play.api.Logging
import services.genomics.YBrowseVariantIngestionService

import java.io.{BufferedInputStream, BufferedReader, FileOutputStream, InputStreamReader}
import java.io.{BufferedInputStream, FileOutputStream}
import java.net.{HttpURLConnection, URI}
import java.nio.file.Files
import java.util.zip.{GZIPInputStream, GZIPOutputStream}
import scala.concurrent.{ExecutionContext, Future}
import scala.util.{Failure, Success, Try}

Expand Down Expand Up @@ -68,180 +67,81 @@ class YBrowseVariantUpdateActor @javax.inject.Inject()(

private def runUpdate(): Future[UpdateResult] = {
Future {
downloadVcfFile()
downloadGffFile()
}.flatMap {
case Success(_) =>
logger.info("VCF file downloaded successfully, sanitizing VCF")
Future(sanitizeVcfFile()).flatMap {
case Success(skipped) =>
logger.info(s"VCF sanitized (removed $skipped malformed records), starting ingestion")
ingestionService.ingestVcf(genomicsConfig.ybrowseVcfStoragePath).map { count =>
UpdateResult(success = true, variantsIngested = count, s"Successfully ingested $count variants (skipped $skipped malformed records)")
}
case Failure(ex) =>
Future.successful(UpdateResult(success = false, variantsIngested = 0, s"Sanitization failed: ${ex.getMessage}"))
logger.info("GFF file downloaded successfully, starting ingestion")
ingestionService.ingestGff(genomicsConfig.ybrowseGffStoragePath).map { count =>
UpdateResult(success = true, variantsIngested = count, s"Successfully ingested $count variants from GFF")
}
case Failure(ex) =>
Future.successful(UpdateResult(success = false, variantsIngested = 0, s"Download failed: ${ex.getMessage}"))
}
}

private def downloadVcfFile(): Try[Unit] = Try {
val url = URI.create(genomicsConfig.ybrowseVcfUrl).toURL
val targetFile = genomicsConfig.ybrowseVcfStoragePath

// Ensure parent directory exists
val parentDir = targetFile.getParentFile
if (parentDir != null && !parentDir.exists()) {
Files.createDirectories(parentDir.toPath)
logger.info(s"Created directory: ${parentDir.getAbsolutePath}")
}

// Download to a temp file first, then rename (atomic operation)
val tempFile = new java.io.File(targetFile.getAbsolutePath + ".tmp")

logger.info(s"Downloading VCF from ${genomicsConfig.ybrowseVcfUrl} to ${tempFile.getAbsolutePath}")
private def downloadGffFile(): Try[Unit] = Try {
val url = URI.create(genomicsConfig.ybrowseGffUrl).toURL
val targetFile = genomicsConfig.ybrowseGffStoragePath

// Check for fresh local file (cache for 24 hours)
val cacheDuration = 24 * 60 * 60 * 1000L // 24 hours in millis
if (targetFile.exists() && (System.currentTimeMillis() - targetFile.lastModified() < cacheDuration)) {
logger.info(s"Local GFF file is fresh (< 24 hours old), skipping download: ${targetFile.getAbsolutePath}")
} else {
// Ensure parent directory exists
val parentDir = targetFile.getParentFile
if (parentDir != null && !parentDir.exists()) {
Files.createDirectories(parentDir.toPath)
logger.info(s"Created directory: ${parentDir.getAbsolutePath}")
}

val connection = url.openConnection().asInstanceOf[HttpURLConnection]
connection.setRequestMethod("GET")
connection.setConnectTimeout(30000) // 30 seconds
connection.setReadTimeout(300000) // 5 minutes for large file
// Download to a temp file first, then rename (atomic operation)
val tempFile = new java.io.File(targetFile.getAbsolutePath + ".tmp")

try {
val responseCode = connection.getResponseCode
if (responseCode != HttpURLConnection.HTTP_OK) {
throw new RuntimeException(s"HTTP request failed with status $responseCode")
}
logger.info(s"Downloading GFF from ${genomicsConfig.ybrowseGffUrl} to ${tempFile.getAbsolutePath}")

val inputStream = new BufferedInputStream(connection.getInputStream)
val outputStream = new FileOutputStream(tempFile)
val connection = url.openConnection().asInstanceOf[HttpURLConnection]
connection.setRequestMethod("GET")
connection.setConnectTimeout(30000) // 30 seconds
connection.setReadTimeout(300000) // 5 minutes for large file

try {
val buffer = new Array[Byte](8192)
var bytesRead = 0
var totalBytes = 0L

while ({ bytesRead = inputStream.read(buffer); bytesRead != -1 }) {
outputStream.write(buffer, 0, bytesRead)
totalBytes += bytesRead
val responseCode = connection.getResponseCode
if (responseCode != HttpURLConnection.HTTP_OK) {
throw new RuntimeException(s"HTTP request failed with status $responseCode")
}

logger.info(s"Downloaded $totalBytes bytes")
} finally {
inputStream.close()
outputStream.close()
}

// Atomic rename
if (targetFile.exists()) {
targetFile.delete()
}
if (!tempFile.renameTo(targetFile)) {
throw new RuntimeException(s"Failed to rename temp file to ${targetFile.getAbsolutePath}")
}
val inputStream = new BufferedInputStream(connection.getInputStream)
val outputStream = new FileOutputStream(tempFile)

logger.info(s"VCF file saved to ${targetFile.getAbsolutePath}")
} finally {
connection.disconnect()
}
}
try {
val buffer = new Array[Byte](8192)
var bytesRead = 0
var totalBytes = 0L

/**
* Sanitizes the VCF file by removing malformed records that HTSJDK cannot parse.
* Specifically filters out records with duplicate alleles (REF == ALT or duplicate ALT alleles).
*
* @return Try containing the number of skipped records
*/
private def sanitizeVcfFile(): Try[Int] = Try {
val sourceFile = genomicsConfig.ybrowseVcfStoragePath
val tempFile = new java.io.File(sourceFile.getAbsolutePath + ".sanitized.tmp")

logger.info(s"Sanitizing VCF file: ${sourceFile.getAbsolutePath}")

val inputStream = new BufferedReader(
new InputStreamReader(
new GZIPInputStream(
new BufferedInputStream(
new java.io.FileInputStream(sourceFile)
)
)
)
)

val outputStream = new java.io.PrintWriter(
new java.io.OutputStreamWriter(
new GZIPOutputStream(
new FileOutputStream(tempFile)
)
)
)

var skippedCount = 0
var lineNumber = 0

try {
var line: String = null
while ({ line = inputStream.readLine(); line != null }) {
lineNumber += 1
if (line.startsWith("#")) {
// Header line - pass through
outputStream.println(line)
} else {
// Data line - check for duplicate alleles
if (isValidVcfDataLine(line)) {
outputStream.println(line)
} else {
skippedCount += 1
if (skippedCount <= 10) {
logger.warn(s"Skipping malformed VCF record at line $lineNumber: ${line.take(100)}...")
}
while ({ bytesRead = inputStream.read(buffer); bytesRead != -1 }) {
outputStream.write(buffer, 0, bytesRead)
totalBytes += bytesRead
}

logger.info(s"Downloaded $totalBytes bytes")
} finally {
inputStream.close()
outputStream.close()
}
}

if (skippedCount > 10) {
logger.warn(s"Skipped ${skippedCount - 10} additional malformed records (warnings suppressed)")
}
} finally {
inputStream.close()
outputStream.close()
}
// Atomic rename
if (targetFile.exists()) {
targetFile.delete()
}
if (!tempFile.renameTo(targetFile)) {
throw new RuntimeException(s"Failed to rename temp file to ${targetFile.getAbsolutePath}")
}

// Replace original with sanitized version
if (sourceFile.exists()) {
sourceFile.delete()
}
if (!tempFile.renameTo(sourceFile)) {
throw new RuntimeException(s"Failed to rename sanitized file to ${sourceFile.getAbsolutePath}")
logger.info(s"GFF file saved to ${targetFile.getAbsolutePath}")
} finally {
connection.disconnect()
}
}

logger.info(s"VCF sanitization complete. Processed $lineNumber lines, skipped $skippedCount malformed records.")
skippedCount
}

/**
 * Validates a VCF data line for common issues that break HTSJDK parsing.
 *
 * A line is accepted only when all of the following hold:
 *  - it has at least the five leading tab-separated columns (CHROM, POS, ID, REF, ALT);
 *  - REF is non-empty;
 *  - ALT is either the missing-value marker "." or a comma-separated list of
 *    non-empty alleles;
 *  - no allele is repeated (compared case-insensitively) across REF and ALT.
 *
 * @param line a single data (non-header) line of the VCF file
 * @return true if the record passes every check above, false if it should be skipped
 */
private def isValidVcfDataLine(line: String): Boolean = {
  // A limit of 6 keeps the first five columns intact and leaves the rest of the
  // line as one unsplit tail; only CHROM, POS, ID, REF and ALT are inspected.
  val fields = line.split("\t", 6)

  if (fields.length < 5) {
    false
  } else {
    // Locale.ROOT keeps the case folding independent of the JVM default locale.
    val ref = fields(3).toUpperCase(java.util.Locale.ROOT)
    val altField = fields(4)

    if (ref.isEmpty) {
      // REF is a required field; an empty value is malformed.
      false
    } else if (altField == ".") {
      // Missing ALT: nothing to compare against REF.
      true
    } else {
      // Limit -1 keeps trailing empty entries, so "G," is seen as two alleles
      // (one of them empty) instead of being silently truncated to "G".
      val alts = altField.split(",", -1).map(_.toUpperCase(java.util.Locale.ROOT))
      val allAlleles = ref +: alts

      // Fewer distinct alleles than total alleles means REF/ALT contain a duplicate.
      alts.forall(_.nonEmpty) && allAlleles.distinct.length == allAlleles.length
    }
  }
}
}
5 changes: 5 additions & 0 deletions app/config/FeatureFlags.scala
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,9 @@ class FeatureFlags @Inject()(config: Configuration) {
* Disabled by default until age data is populated.
*/
val showBranchAgeEstimates: Boolean = featuresConfig.getOptional[Boolean]("tree.showBranchAgeEstimates").getOrElse(false)

/**
* Show the alternative "Block Layout" (ytree.net style) for the tree.
*
* Read from the optional boolean key "tree.showVerticalTree" of `featuresConfig`;
* evaluates to `false` when that key is not set.
*/
val showVerticalTree: Boolean = featuresConfig.getOptional[Boolean]("tree.showVerticalTree").getOrElse(false)
}
4 changes: 2 additions & 2 deletions app/config/GenomicsConfig.scala
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@ class GenomicsConfig @Inject()(config: Configuration) {
}

// YBrowse configuration
// Each value below is read from a string key of `genomicsConfig`; the storage
// paths are wrapped in java.io.File.
// Remote URL of the YBrowse VCF (key "ybrowse.vcf_url").
val ybrowseVcfUrl: String = genomicsConfig.get[String]("ybrowse.vcf_url")
// Local file for the YBrowse VCF (key "ybrowse.vcf_storage_path").
val ybrowseVcfStoragePath: File = new File(genomicsConfig.get[String]("ybrowse.vcf_storage_path"))
// Remote URL of the YBrowse GFF (key "ybrowse.gff_url").
val ybrowseGffUrl: String = genomicsConfig.get[String]("ybrowse.gff_url")
// Local file for the YBrowse GFF (key "ybrowse.gff_storage_path").
val ybrowseGffStoragePath: File = new File(genomicsConfig.get[String]("ybrowse.gff_storage_path"))

/**
* Retrieves the path to a liftover chain file for a given source and target genome.
Expand Down
Loading
Loading