Skip to content

Commit 8f51ea8

Browse files
committed
Add code to compare incoming and existing files for readset and warn about duplicate names
1 parent cd22e7a commit 8f51ea8

File tree

1 file changed

+61
-0
lines changed

1 file changed

+61
-0
lines changed

SequenceAnalysis/src/org/labkey/sequenceanalysis/pipeline/ReadsetInitTask.java

Lines changed: 61 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -27,6 +27,7 @@
2727
import org.labkey.api.pipeline.RecordedAction;
2828
import org.labkey.api.pipeline.RecordedActionSet;
2929
import org.labkey.api.pipeline.WorkDirectoryTask;
30+
import org.labkey.api.sequenceanalysis.SequenceAnalysisService;
3031
import org.labkey.api.sequenceanalysis.pipeline.TaskFileManager;
3132
import org.labkey.api.util.Compress;
3233
import org.labkey.api.util.FileType;
@@ -45,8 +46,10 @@
4546
import java.util.Arrays;
4647
import java.util.Collection;
4748
import java.util.Collections;
49+
import java.util.HashMap;
4850
import java.util.HashSet;
4951
import java.util.List;
52+
import java.util.Map;
5053
import java.util.Set;
5154

5255
/**
@@ -163,6 +166,8 @@ public RecordedActionSet run() throws PipelineJobException
163166
List<FileGroup> fileGroups = getHelper().getSettings().getFileGroups(getPipelineJob());
164167
List<SequenceReadsetImpl> readsets = getHelper().getSettings().getReadsets(getPipelineJob());
165168

169+
checkForDuplicateFileNames(readsets, fileGroups);
170+
166171
if (!SequenceNormalizationTask.shouldRunRemote(getJob()))
167172
{
168173
getJob().getLogger().info("No files required external normalization, processing inputs locally");
@@ -486,5 +491,61 @@ private static void moveInputToAnalysisDir(File input, SequenceJob job, Collecti
486491
throw new PipelineJobException(e);
487492
}
488493
}
494+
495+
private void checkForDuplicateFileNames(List<SequenceReadsetImpl> readsets, List<FileGroup> fileGroups) throws PipelineJobException
496+
{
497+
// check for duplicate filename between incoming and existing
498+
for (SequenceReadsetImpl r : readsets)
499+
{
500+
boolean readsetExists = r.getReadsetId() != null && r.getReadsetId() > 0;
501+
SequenceReadsetImpl existingReadset = readsetExists ? ((SequenceReadsetImpl) SequenceAnalysisService.get().getReadset(r.getReadsetId(), getJob().getUser())) : null;
502+
List<ReadDataImpl> preexistingReadData = readsetExists ? existingReadset.getReadDataImpl() : Collections.emptyList();
503+
if (!preexistingReadData.isEmpty())
504+
{
505+
Map<String, File> existingFileNames = new HashMap<>();
506+
preexistingReadData.forEach(rd -> {
507+
existingFileNames.put(rd.getFile1().getName(), rd.getFile1());
508+
if (rd.getFile2() != null)
509+
{
510+
existingFileNames.put(rd.getFile2().getName(), rd.getFile2());
511+
}
512+
});
513+
514+
Map<String, File> sharedFns = new HashMap<>();
515+
for (FileGroup fg : fileGroups)
516+
{
517+
if (r.getFileSetName() != null && r.getFileSetName().equals(fg.name))
518+
{
519+
for (FileGroup.FilePair fp : fg.filePairs)
520+
{
521+
if (existingFileNames.containsKey(fp.file1.getName()))
522+
{
523+
sharedFns.put(fp.file1.getName(), fp.file1);
524+
}
525+
526+
if (fp.file2 != null && existingFileNames.containsKey(fp.file2.getName()))
527+
{
528+
sharedFns.put(fp.file2.getName(), fp.file2);
529+
}
530+
}
531+
}
532+
}
533+
534+
if (!sharedFns.isEmpty())
535+
{
536+
getJob().getLogger().debug("Duplicate file names found between incoming and existing for: " + r.getName());
537+
for (String newFile : sharedFns.keySet())
538+
{
539+
long diff = Math.abs(sharedFns.get(newFile).length() - existingFileNames.get(newFile).length());
540+
getJob().getLogger().debug("File name: " + newFile + ", with size difference: " + diff);
541+
if (diff < 100)
542+
{
543+
throw new PipelineJobException("Identical filenames with nearly identical size detected between existing and new files for readset: " + r.getName());
544+
}
545+
}
546+
}
547+
}
548+
}
549+
}
489550
}
490551

0 commit comments

Comments
 (0)