Skip to content

Commit d44ae4b

Browse files
authored
feat(ENGKNOW-3046): Enable multiple cram reference files. (#102)
1 parent bc6a57c commit d44ae4b

14 files changed

Lines changed: 1290 additions & 68 deletions

File tree

gortools/src/test/java/gorsat/UTestCram.java

Lines changed: 30 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -31,12 +31,17 @@
3131
import org.junit.Assert;
3232
import org.junit.Rule;
3333
import org.junit.Test;
34+
import org.junit.contrib.java.lang.system.RestoreSystemProperties;
3435
import org.junit.rules.TemporaryFolder;
3536

3637
import java.io.File;
3738
import java.io.IOException;
3839
import java.nio.charset.Charset;
3940
import java.nio.file.Paths;
41+
import java.util.List;
42+
43+
import static gorsat.TestUtils.LINE_SPLIT_PATTERN;
44+
import static org.gorpipe.gor.driver.providers.stream.datatypes.cram.CramIterator.KEY_REFERENCE_FORCE_FOLDER;
4045

4146
public class UTestCram {
4247

@@ -45,6 +50,9 @@ public class UTestCram {
4550
@Rule
4651
public TemporaryFolder workDir = new TemporaryFolder();
4752

53+
@Rule
54+
public RestoreSystemProperties restoreSystemProperties = new RestoreSystemProperties();
55+
4856
public static File createWrongConfigFile(File directory) throws IOException {
4957
return FileTestUtils.createTempFile(directory, "generic.gor",
5058
"buildPath\t../tests/data/ref_mini/chromSeq\n" +
@@ -106,7 +114,7 @@ public void readCramWithFastaReferenceFromConfig() {
106114
public void readCramWithFastaReferenceFromConfigException() throws IOException {
107115
File wrongConfigFile = createWrongConfigFile(workDir.getRoot());
108116
System.clearProperty("gor.driver.cram.fastareferencesource");
109-
String[] args = new String[]{
117+
String[] args = new String[] {
110118
"gor " + DataUtil.toFile("../tests/data/external/samtools/cram_query_sorted", DataType.CRAM),
111119
"-config",
112120
wrongConfigFile.getCanonicalPath()};
@@ -120,24 +128,29 @@ public void readCramWithFastaReferenceFromConfigException() throws IOException {
120128

121129
@Test
122130
public void readCramWithFastaReferenceAndGenerateMissingAttributes() {
123-
try {
124-
System.setProperty("gor.driver.cram.fastareferencesource", DataUtil.toFile("../tests/data/external/samtools/cram_query_sorted", DataType.FASTA));
125-
System.setProperty("gor.driver.cram.generatemissingattributes", "false");
126-
String[] linesWithoutMissingAttributes = TestUtils.runGorPipeLines("gor " + DataUtil.toFile("../tests/data/external/samtools/cram_query_sorted", DataType.CRAM));
127-
System.setProperty("gor.driver.cram.generatemissingattributes", "true");
128-
String[] linesWithMissingAttributes = TestUtils.runGorPipeLines("gor " + DataUtil.toFile("../tests/data/external/samtools/cram_query_sorted", DataType.CRAM));
129-
130-
Assert.assertEquals(8, linesWithoutMissingAttributes.length);
131-
Assert.assertEquals(8, linesWithMissingAttributes.length);
132-
// See if we have the missing entry in the last column.
133-
Assert.assertFalse(linesWithoutMissingAttributes[1].contains("NM="));
134-
Assert.assertTrue(linesWithMissingAttributes[1].contains("NM="));
131+
System.setProperty("gor.driver.cram.fastareferencesource", DataUtil.toFile("../tests/data/external/samtools/cram_query_sorted", DataType.FASTA));
132+
System.setProperty(KEY_REFERENCE_FORCE_FOLDER, "false");
135133

136-
} finally {
137-
System.clearProperty("gor.driver.cram.fastareferencesource");
138-
System.clearProperty("gor.driver.cram.generatemissingattributes");
139-
}
134+
String[] args = new String[] {"gor " + DataUtil.toFile("../tests/data/external/samtools/cram_query_sorted", DataType.CRAM)};
135+
136+
System.setProperty("gor.driver.cram.generatemissingattributes", "false");
137+
String[] linesWithoutMissingAttributes = TestUtils.runGorPipe(args, false).split(LINE_SPLIT_PATTERN);
138+
139+
System.setProperty("gor.driver.cram.generatemissingattributes", "true");
140+
String[] linesWithMissingAttributesCramRef = TestUtils.runGorPipe(args, false).split(LINE_SPLIT_PATTERN);
141+
142+
args = new String[] {
143+
"gor " + DataUtil.toFile("../tests/data/external/samtools/cram_query_sorted", DataType.CRAM)
144+
, "-config", "../tests/data/ref_mini/gor_config.txt"};
145+
String[] linesWithMissingAttributesProjectRef = TestUtils.runGorPipe(args, false).split(LINE_SPLIT_PATTERN);
140146

147+
Assert.assertEquals(8, linesWithoutMissingAttributes.length);
148+
Assert.assertEquals(8, linesWithMissingAttributesCramRef.length);
149+
Assert.assertEquals(8, linesWithMissingAttributesProjectRef.length);
150+
// See if we have the missing entry in the last column.
151+
Assert.assertFalse(linesWithoutMissingAttributes[1].contains("NM="));
152+
Assert.assertTrue(linesWithMissingAttributesCramRef[1].contains("NM="));
153+
Assert.assertTrue(linesWithMissingAttributesProjectRef[1].contains("NM="));
141154
}
142155

143156
@Test(expected = GorResourceException.class)

model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/CramIterator.java

Lines changed: 46 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -32,17 +32,19 @@
3232
import org.apache.commons.io.FilenameUtils;
3333
import org.apache.commons.lang3.StringUtils;
3434
import org.gorpipe.exceptions.GorResourceException;
35-
import org.gorpipe.gor.driver.meta.DataType;
3635
import org.gorpipe.gor.driver.providers.stream.datatypes.bam.BamIterator;
36+
import org.gorpipe.gor.driver.providers.stream.datatypes.cram.reference.CompositeReferenceSource;
37+
import org.gorpipe.gor.driver.providers.stream.datatypes.cram.reference.EBIReferenceSource;
38+
import org.gorpipe.gor.driver.providers.stream.datatypes.cram.reference.FolderReferenceSource;
39+
import org.gorpipe.gor.driver.providers.stream.datatypes.cram.reference.SharedFastaReferenceSource;
3740
import org.gorpipe.gor.model.ChromoLookup;
3841
import org.gorpipe.gor.session.GorSession;
3942
import org.gorpipe.gor.driver.adapters.StreamSourceSeekableStream;
4043
import org.gorpipe.gor.driver.providers.stream.sources.StreamSource;
4144
import org.gorpipe.gor.table.util.PathUtils;
42-
import org.gorpipe.gor.util.DataUtil;
43-
import org.gorpipe.gor.util.StringUtil;
4445
import org.gorpipe.gor.model.Row;
45-
import org.gorpipe.gor.model.SharedFastaReferenceSource;
46+
import org.gorpipe.model.gor.iterators.RefSeq;
47+
import org.gorpipe.util.Strings;
4648
import org.slf4j.Logger;
4749
import org.slf4j.LoggerFactory;
4850

@@ -66,19 +68,21 @@
6668
*/
6769
public class CramIterator extends BamIterator {
6870

69-
private final static String KEY_GENERATEMISSINGATTRIBUTES = "gor.driver.cram.generatemissingattributes";
70-
private final static String KEY_FASTAREFERENCESOURCE = "gor.driver.cram.fastareferencesource";
71+
public final static String KEY_GENERATEMISSINGATTRIBUTES = "gor.driver.cram.generatemissingattributes";
72+
public final static String KEY_FASTAREFERENCESOURCE = "gor.driver.cram.fastareferencesource";
73+
public final static String KEY_REFERENCE_FORCE_FOLDER = "gor.driver.cram.reference.force.folder.";
7174

7275
private static final Logger log = LoggerFactory.getLogger(CramIterator.class);
7376

7477
private CramFile cramFile;
7578
private int[] columns;
7679
ChromoLookup lookup;
7780
private String fileName;
78-
private String cramReferencePath = "";
79-
private CRAMFileReader cramFileReader;
80-
private ReferenceSequenceFile referenceSequenceFile;
81+
private String projectCramReferencePath; // Cram reference path from project context.
82+
private RefSeq projectRefSeq; // RefSeq from project context, used for MD tag calculation.
83+
private ReferenceSequenceFile referenceSequenceFile; // Handle to cram reference file, fallback for MD tag calculation.
8184
private CRAMReferenceSource referenceSource;
85+
private CRAMFileReader cramFileReader;
8286
private boolean generateMissingCramAttributes;
8387

8488
/**
@@ -94,35 +98,6 @@ public CramIterator(ChromoLookup lookup, CramFile cramFile, int[] columns) {
9498
this.lookup = lookup;
9599
}
96100

97-
98-
/**
99-
* Construct a CramIterator
100-
*
101-
* @param lookup The lookup service for chromosome name to ids
102-
* @param file The CRAM File to iterate through
103-
*/
104-
public CramIterator(ChromoLookup lookup, String file, String index, String reference, boolean generateMissingAttributes) {
105-
106-
fileName = file;
107-
generateMissingCramAttributes = generateMissingAttributes;
108-
File cramfile = new File(file);
109-
File cramindex = new File(index);
110-
if (!cramindex.exists()) {
111-
cramindex = new File(DataUtil.toFile(file, DataType.CRAI));
112-
}
113-
114-
referenceSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(new File(reference));
115-
referenceSource = createReferenceSource(fileName, "");
116-
117-
try {
118-
cramFileReader = new CRAMFileReader(cramfile, new FileInputStream(cramindex), referenceSource);
119-
} catch (FileNotFoundException e) {
120-
throw new GorResourceException("Cram file not found.", file, e);
121-
}
122-
SamReader samreader = new SamReader.PrimitiveSamReaderToSamReaderAdapter(cramFileReader, null);
123-
init(lookup, samreader, true);
124-
}
125-
126101
@Override
127102
public Row next() {
128103
Row row = super.next();
@@ -135,8 +110,16 @@ public Row next() {
135110
boolean calculateNM = record.getIntegerAttribute(SAMTag.NM.name()) == null;
136111

137112
if (calculateMD) {
138-
byte[] referenceBytes = referenceSequenceFile.getSubsequenceAt(record.getContig(), record.getAlignmentStart(), record.getAlignmentEnd()).getBases();
139-
CramUtils.calculateMdAndNmTags(record, referenceBytes, calculateMD, calculateNM);
113+
byte[] referenceBytes = null;
114+
if (projectRefSeq != null) {
115+
referenceBytes = projectRefSeq.getBases(record.getContig(), record.getAlignmentStart(), record.getAlignmentEnd()).getBytes();
116+
} else if (referenceSequenceFile != null) {
117+
// Fallback to the reference file used by the CRAM reader
118+
referenceBytes = referenceSequenceFile.getSubsequenceAt(record.getContig(), record.getAlignmentStart(), record.getAlignmentEnd()).getBases();
119+
}
120+
if (referenceBytes != null) {
121+
CramUtils.calculateMdAndNmTags(record, referenceBytes, calculateMD, calculateNM);
122+
}
140123
} else if (calculateNM) {
141124
SequenceUtil.calculateSamNmTagFromCigar(record);
142125
}
@@ -170,7 +153,10 @@ public void init(GorSession session) {
170153
return;
171154
}
172155

173-
cramReferencePath = session.getProjectContext().getReferenceBuild().getCramReferencePath();
156+
projectCramReferencePath = session.getProjectContext().getReferenceBuild().getCramReferencePath();
157+
if (!Strings.isNullOrEmpty(session.getProjectContext().getGorConfigFile())) {
158+
projectRefSeq = session.getProjectContext().createRefSeq();
159+
}
174160

175161
if (cramFile != null) {
176162
// I read this property here through System.getProperty as there is no other way to pass properties to the driver
@@ -237,7 +223,7 @@ private CRAMReferenceSource createReferenceSource(String ref, String root) {
237223
}
238224

239225
// This reference should be fasta but we let the htsjdk library decide
240-
return createFileReference(file.toString());
226+
return createFileReference(file);
241227
}
242228

243229
private File getReferenceFromGorOptions(File file) {
@@ -252,8 +238,8 @@ private File getReferenceFromGorOptions(File file) {
252238
}
253239

254240
private File getReferenceFromGorConfig(File file, String root) {
255-
if (!file.exists() && !StringUtil.isEmpty(cramReferencePath)) {
256-
return PathUtils.resolve(Paths.get(root), Paths.get(cramReferencePath)).toFile();
241+
if (!file.exists() && !Strings.isNullOrEmpty(projectCramReferencePath)) {
242+
return PathUtils.resolve(Paths.get(root), Paths.get(projectCramReferencePath)).toFile();
257243
}
258244
return file;
259245
}
@@ -277,10 +263,22 @@ private File getReferenceFromReferenceLinkFile(File file) {
277263
return file;
278264
}
279265

280-
private CRAMReferenceSource createFileReference(String ref) {
281-
String referenceKey = FilenameUtils.removeExtension(FilenameUtils.getBaseName(ref));
282-
referenceSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(new File(ref));
283-
return new SharedFastaReferenceSource(referenceSequenceFile, referenceKey);
266+
private CRAMReferenceSource createFileReference(File refFile) {
267+
if (refFile.isDirectory()) {
268+
return new CompositeReferenceSource(List.of(
269+
new FolderReferenceSource(refFile.getPath()),
270+
new EBIReferenceSource(refFile.getPath())));
271+
} else if (Boolean.getBoolean(System.getProperty(KEY_REFERENCE_FORCE_FOLDER, "true"))) {
272+
return new CompositeReferenceSource(List.of(
273+
new FolderReferenceSource(refFile.getParent()),
274+
new EBIReferenceSource(refFile.getParent())));
275+
} else {
276+
referenceSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(refFile);
277+
278+
String referenceKey = FilenameUtils.removeExtension(refFile.getName());
279+
var referenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(refFile);
280+
return new SharedFastaReferenceSource(referenceFile, referenceKey);
281+
}
284282
}
285283

286284
}
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
package org.gorpipe.gor.driver.providers.stream.datatypes.cram.reference;
2+
3+
import htsjdk.samtools.SAMSequenceRecord;
4+
import htsjdk.samtools.cram.ref.CRAMReferenceSource;
5+
6+
import java.io.Closeable;
7+
import java.io.IOException;
8+
import java.util.ArrayList;
9+
import java.util.List;
10+
11+
/**
12+
* Composite reference source, tries out different reference source in order.
13+
*/
14+
public class CompositeReferenceSource implements CRAMReferenceSource, Closeable {
15+
16+
List<CRAMReferenceSource> sources;
17+
18+
public CompositeReferenceSource(List<CRAMReferenceSource> sources) {
19+
this.sources = sources != null ? new ArrayList<>(sources) : new ArrayList<>();
20+
}
21+
22+
@Override
23+
public byte[] getReferenceBases(SAMSequenceRecord sequenceRecord, boolean tryNameVariants) {
24+
byte[] bytes = null;
25+
for (var source : sources) {
26+
bytes = source.getReferenceBases(sequenceRecord, tryNameVariants);
27+
if (bytes != null) {
28+
return bytes;
29+
}
30+
}
31+
return bytes;
32+
}
33+
34+
@Override
35+
public byte[] getReferenceBasesByRegion(SAMSequenceRecord sequenceRecord, int zeroBasedStart, int requestedRegionLength) {
36+
byte[] bytes = null;
37+
for (var source : sources) {
38+
bytes = source.getReferenceBasesByRegion(sequenceRecord, zeroBasedStart, requestedRegionLength);
39+
if (bytes != null) {
40+
return bytes;
41+
}
42+
}
43+
return bytes;
44+
}
45+
46+
@Override
47+
public void close() throws IOException {
48+
for (var source : sources) {
49+
if (source instanceof Closeable) {
50+
((Closeable) source).close();
51+
}
52+
}
53+
sources.clear();
54+
}
55+
}

0 commit comments

Comments
 (0)