-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpool_junction_lists.py
More file actions
104 lines (93 loc) · 4.44 KB
/
pool_junction_lists.py
File metadata and controls
104 lines (93 loc) · 4.44 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
#!/usr/bin/env python
"""
pool_junction_lists
Parses a SORTED combined junction list with final column
Combines junction data from input list of intropolis-like files to
create a single pooled junctions file from which
a master morna index can be made
"""
import argparse
import gzip
import sys
parser = argparse.ArgumentParser()
parser.add_argument('-i', '--input', metavar='<file>', type=str,
required=True,
help='path to gzipped input file of combined junctions'
'eg: sorted_cjs.tsv.gz'
)
parser.add_argument('-s','--sources', nargs='+', metavar='<files>', type=str,
required=True,
help=('list of sources which must exactly match the last column '
'values of the input file; this helps align the sample ids'
'eg: tcga sra gtex')
)
parser.add_argument('-m','--metafiles', nargs='+', metavar='<files>', type=str,
required=False,
default=None,
help=('list of metadata files matching up with sources in order;'
'must have tab separated first field with sample id, then'
'remainder of line is any and all metadata that should be'
'associated with that sample id, such as accession number'
'and other useful details to include with search results.'
'If included, output will include a combined metadata file'
'named source1_source2..._metadata.tsv ')
)
parser.add_argument('-n', '--increment', metavar='<int>', type=int,
required=True,
help='since each input file has its own integer sample ids '
'which start from 0, increment the ids in each subsequent file '
'by this much for each file it is after the first, which must '
'be made enough to separate the sample id ranges entirely'
'eg: since TCGA has 11349 samples, GTEX 9662 and SRA 20854,'
'25000 is an appropriate and readable increment'
)
parser.add_argument('-o', '--outfile', metavar='<file>', type=str,
required=True,
help='path to output file, which will be gzipped'
'eg: pooled_junctions.tsv.gz'
)
#junction_line_offsets = {}
junction_presence = {}
stored_tokens=""
if __name__ == '__main__':
args = parser.parse_args()
#this source offset dictionary is used to alter sample ids
source_offset = {}
for i,source in enumerate(args.sources):
source_offset[source] = i * args.increment
print source_offset
with gzip.open(args.input) as input_handle,\
gzip.open(args.outfile, "w") as out_handle:
for i, line in enumerate(input_handle):
if (i % 1000 == 0):
sys.stdout.write(str(i)
+ " lines into combined file " + args.input + "\r")
sys.stdout.flush()
tokens = line.strip().split("\t")
#we'll have to adjust the sample ids by an appropriate offset
offset = source_offset[tokens[-1]]
#so break them out into numbers, add offset, and back to string!
tokens[-3] = ",".join(str(int(x) + offset)
for x in tokens[-3].split(","))
#if this line isn't the same junction as the stored, write stored
if stored_tokens[:3] != tokens[:3]:
#write accumulated junction line, leaving out source indicator
if i>0:
out_handle.write("\t".join(stored_tokens[:-1])+"\n")
#and move current tokens into stored
stored_tokens = tokens
#otherwise, this is a continuation of a junction from another source
else:
#add newly offset sample ids and coverages
stored_tokens[-3] = stored_tokens[-3] + "," + tokens[-3]
stored_tokens[-2] = stored_tokens[-2] + "," + tokens[-2]
#finish off with final outfile write
out_handle.write("\t".join(stored_tokens[:-1])+"\n")
if len(args.metafiles) > 0:
with open("_".join(args.sources) + "_metadata.tsv","w") as meta_out:
for i,file in enumerate(args.metafiles):
with open(file) as metafile:
for line in metafile:
tokens = line.strip().split("\t")
tokens[0]= str(int(tokens[0]) + i * args.increment)
meta_out.write("\t".join(tokens) + "\n")