3232import org .apache .commons .io .FilenameUtils ;
3333import org .apache .commons .lang3 .StringUtils ;
3434import org .gorpipe .exceptions .GorResourceException ;
35- import org .gorpipe .gor .driver .meta .DataType ;
3635import org .gorpipe .gor .driver .providers .stream .datatypes .bam .BamIterator ;
36+ import org .gorpipe .gor .driver .providers .stream .datatypes .cram .reference .CompositeReferenceSource ;
37+ import org .gorpipe .gor .driver .providers .stream .datatypes .cram .reference .EBIReferenceSource ;
38+ import org .gorpipe .gor .driver .providers .stream .datatypes .cram .reference .FolderReferenceSource ;
39+ import org .gorpipe .gor .driver .providers .stream .datatypes .cram .reference .SharedFastaReferenceSource ;
3740import org .gorpipe .gor .model .ChromoLookup ;
3841import org .gorpipe .gor .session .GorSession ;
3942import org .gorpipe .gor .driver .adapters .StreamSourceSeekableStream ;
4043import org .gorpipe .gor .driver .providers .stream .sources .StreamSource ;
4144import org .gorpipe .gor .table .util .PathUtils ;
42- import org .gorpipe .gor .util .DataUtil ;
43- import org .gorpipe .gor .util .StringUtil ;
4445import org .gorpipe .gor .model .Row ;
45- import org .gorpipe .gor .model .SharedFastaReferenceSource ;
46+ import org .gorpipe .model .gor .iterators .RefSeq ;
47+ import org .gorpipe .util .Strings ;
4648import org .slf4j .Logger ;
4749import org .slf4j .LoggerFactory ;
4850
6668 */
6769public class CramIterator extends BamIterator {
6870
69- private final static String KEY_GENERATEMISSINGATTRIBUTES = "gor.driver.cram.generatemissingattributes" ;
70- private final static String KEY_FASTAREFERENCESOURCE = "gor.driver.cram.fastareferencesource" ;
71+ public final static String KEY_GENERATEMISSINGATTRIBUTES = "gor.driver.cram.generatemissingattributes" ;
72+ public final static String KEY_FASTAREFERENCESOURCE = "gor.driver.cram.fastareferencesource" ;
73+ public final static String KEY_REFERENCE_FORCE_FOLDER = "gor.driver.cram.reference.force.folder." ;
7174
7275 private static final Logger log = LoggerFactory .getLogger (CramIterator .class );
7376
7477 private CramFile cramFile ;
7578 private int [] columns ;
7679 ChromoLookup lookup ;
7780 private String fileName ;
78- private String cramReferencePath = "" ;
79- private CRAMFileReader cramFileReader ;
80- private ReferenceSequenceFile referenceSequenceFile ;
81+ private String projectCramReferencePath ; // Cram reference path from project context.
82+ private RefSeq projectRefSeq ; // RefSeq from project context, used for MD tag calculation.
83+ private ReferenceSequenceFile referenceSequenceFile ; // Handle to cram reference file, fallback for MD tag calculation.
8184 private CRAMReferenceSource referenceSource ;
85+ private CRAMFileReader cramFileReader ;
8286 private boolean generateMissingCramAttributes ;
8387
8488 /**
@@ -94,35 +98,6 @@ public CramIterator(ChromoLookup lookup, CramFile cramFile, int[] columns) {
9498 this .lookup = lookup ;
9599 }
96100
97-
98- /**
99- * Construct a CramIterator
100- *
101- * @param lookup The lookup service for chromosome name to ids
102- * @param file The CRAM File to iterate through
103- */
104- public CramIterator (ChromoLookup lookup , String file , String index , String reference , boolean generateMissingAttributes ) {
105-
106- fileName = file ;
107- generateMissingCramAttributes = generateMissingAttributes ;
108- File cramfile = new File (file );
109- File cramindex = new File (index );
110- if (!cramindex .exists ()) {
111- cramindex = new File (DataUtil .toFile (file , DataType .CRAI ));
112- }
113-
114- referenceSequenceFile = ReferenceSequenceFileFactory .getReferenceSequenceFile (new File (reference ));
115- referenceSource = createReferenceSource (fileName , "" );
116-
117- try {
118- cramFileReader = new CRAMFileReader (cramfile , new FileInputStream (cramindex ), referenceSource );
119- } catch (FileNotFoundException e ) {
120- throw new GorResourceException ("Cram file not found." , file , e );
121- }
122- SamReader samreader = new SamReader .PrimitiveSamReaderToSamReaderAdapter (cramFileReader , null );
123- init (lookup , samreader , true );
124- }
125-
126101 @ Override
127102 public Row next () {
128103 Row row = super .next ();
@@ -135,8 +110,16 @@ public Row next() {
135110 boolean calculateNM = record .getIntegerAttribute (SAMTag .NM .name ()) == null ;
136111
137112 if (calculateMD ) {
138- byte [] referenceBytes = referenceSequenceFile .getSubsequenceAt (record .getContig (), record .getAlignmentStart (), record .getAlignmentEnd ()).getBases ();
139- CramUtils .calculateMdAndNmTags (record , referenceBytes , calculateMD , calculateNM );
113+ byte [] referenceBytes = null ;
114+ if (projectRefSeq != null ) {
115+ referenceBytes = projectRefSeq .getBases (record .getContig (), record .getAlignmentStart (), record .getAlignmentEnd ()).getBytes ();
116+ } else if (referenceSequenceFile != null ) {
117+ // Fallback to the reference file used by the CRAM reader
118+ referenceBytes = referenceSequenceFile .getSubsequenceAt (record .getContig (), record .getAlignmentStart (), record .getAlignmentEnd ()).getBases ();
119+ }
120+ if (referenceBytes != null ) {
121+ CramUtils .calculateMdAndNmTags (record , referenceBytes , calculateMD , calculateNM );
122+ }
140123 } else if (calculateNM ) {
141124 SequenceUtil .calculateSamNmTagFromCigar (record );
142125 }
@@ -170,7 +153,10 @@ public void init(GorSession session) {
170153 return ;
171154 }
172155
173- cramReferencePath = session .getProjectContext ().getReferenceBuild ().getCramReferencePath ();
156+ projectCramReferencePath = session .getProjectContext ().getReferenceBuild ().getCramReferencePath ();
157+ if (!Strings .isNullOrEmpty (session .getProjectContext ().getGorConfigFile ())) {
158+ projectRefSeq = session .getProjectContext ().createRefSeq ();
159+ }
174160
175161 if (cramFile != null ) {
176162 // I read this property here through System.getProperty as there is no other way to pass properties to the driver
@@ -237,7 +223,7 @@ private CRAMReferenceSource createReferenceSource(String ref, String root) {
237223 }
238224
239225 // This reference should be fasta but we let the htsjdk library decide
240- return createFileReference (file . toString () );
226+ return createFileReference (file );
241227 }
242228
243229 private File getReferenceFromGorOptions (File file ) {
@@ -252,8 +238,8 @@ private File getReferenceFromGorOptions(File file) {
252238 }
253239
254240 private File getReferenceFromGorConfig (File file , String root ) {
255- if (!file .exists () && !StringUtil . isEmpty ( cramReferencePath )) {
256- return PathUtils .resolve (Paths .get (root ), Paths .get (cramReferencePath )).toFile ();
241+ if (!file .exists () && !Strings . isNullOrEmpty ( projectCramReferencePath )) {
242+ return PathUtils .resolve (Paths .get (root ), Paths .get (projectCramReferencePath )).toFile ();
257243 }
258244 return file ;
259245 }
@@ -277,10 +263,22 @@ private File getReferenceFromReferenceLinkFile(File file) {
277263 return file ;
278264 }
279265
280- private CRAMReferenceSource createFileReference (String ref ) {
281- String referenceKey = FilenameUtils .removeExtension (FilenameUtils .getBaseName (ref ));
282- referenceSequenceFile = ReferenceSequenceFileFactory .getReferenceSequenceFile (new File (ref ));
283- return new SharedFastaReferenceSource (referenceSequenceFile , referenceKey );
266+ private CRAMReferenceSource createFileReference (File refFile ) {
267+ if (refFile .isDirectory ()) {
268+ return new CompositeReferenceSource (List .of (
269+ new FolderReferenceSource (refFile .getPath ()),
270+ new EBIReferenceSource (refFile .getPath ())));
271+ } else if (Boolean .getBoolean (System .getProperty (KEY_REFERENCE_FORCE_FOLDER , "true" ))) {
272+ return new CompositeReferenceSource (List .of (
273+ new FolderReferenceSource (refFile .getParent ()),
274+ new EBIReferenceSource (refFile .getParent ())));
275+ } else {
276+ referenceSequenceFile = ReferenceSequenceFileFactory .getReferenceSequenceFile (refFile );
277+
278+ String referenceKey = FilenameUtils .removeExtension (refFile .getName ());
279+ var referenceFile = ReferenceSequenceFileFactory .getReferenceSequenceFile (refFile );
280+ return new SharedFastaReferenceSource (referenceFile , referenceKey );
281+ }
284282 }
285283
286284}
0 commit comments