-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathcontigStats.py
More file actions
83 lines (73 loc) · 2.28 KB
/
contigStats.py
File metadata and controls
83 lines (73 loc) · 2.28 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import sys
infile = open(sys.argv[1], 'r')
def get_next_fasta (fileObject):
'''usage: for header, seq in get_next_fasta(fileObject):
This is a generator that returns one fasta record's header and
sequence at a time from a multiple fasta file. Return character is removed
from the header. The sequence is returned as one continuous string
with no returns. The returned value is a tuple (header, sequence)
If their is no sequence associated with a header, seq will be an
empty string
Code simplification contributed by Dattatreya Mellacheruvu
01/16/2009, Jeffrey R. de Wet
'''
header = ''
seq = ''
#The following for loop gets the header of the first fasta
#record. Skips any leading junk in the file
for line in fileObject:
if line.startswith('>'):
header = line.strip()
break
for line in fileObject:
if line.startswith('>'):
yield header, seq
header = line.strip()
seq = ''
else:
seq += line.strip()
#yield the last entry
if header:
yield header, seq
def sd(myList, avg):
tmp = []
for item in myList:
tmp.append((item - avg)**2)
SD = (float(sum(tmp))/len(tmp))**0.5
return SD
def median(s):
s= sorted(s)
i = len(s)
if not i%2:
return (s[(i/2)-1]+s[i/2])/2.0
return s[i/2]
totalLen = 0
totalSeqs = 0
longest = 0
tmpcount = 0
shortest = 10000000000000000000000000000
lengthlist = []
for header, seq in get_next_fasta(infile):
length = len(seq)
lengthlist.append(length)
totalLen += length
totalSeqs += 1
if length > longest:
longest = length
if length < shortest:
shortest = length
if length >= 100:
tmpcount += 1
avgLen = totalLen/totalSeqs
trimmedAvg = (totalLen-(longest+shortest))/(totalSeqs-2)
print "total contigs:", totalSeqs
print "average length:", avgLen, "bp"
print "trimmed average length:", trimmedAvg, "bp"
#print "other avg: ", float(sum(lengthlist))/len(lengthlist)
print "standard deviation: ", sd(lengthlist, avgLen)
print "median: ", median(lengthlist)
print "greater than or equal to 100: ", tmpcount
print "shortest conting:", shortest, "bp"
print "longest contig:", longest, "bp"
print "total length:", totalLen/1000000.00, "Mb"
infile.close()