seq-align-prep/classify.py at master · jpassaro/seq-align-prep · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
'''classify.py

This module implements the pre-alignment process designed by Justin and John
over approx. Nov 2010 to February 2011.

The primary objects being manipulated here will be variously called 'fragments',
'hits', or 'records'. They represent BLAST hits, i.e., results from a BLAST
search. They will usually be generated by the funcion hitsfromcsv(), which
operates on a file. Most functions taking hits expect they will have, at least,
the following attributes:

QSEQID (the name of the query sequence, generally a transposon)
SSEQID (the name of the subject sequence, generally some scaffold in a
        fly genome)
QSTART (the starting point of the query side of the alignment)
QEND   (the ending point of same)
SSTART (the starting point of the subject side of the alignment)
SEND   (the ending point of same)
EVALUE (the evalue score of the hit, i.e. a measure of how strong the match is)
SSEQ   (the matching part of the subject sequence)

These attributes are also expected, but are set automatically by hitsfromcsv():

LENGTH   (the length of the hit, measured by subject sequence)
ORIENTED (True if the subject sequence and the query sequence are oriented the
          same, False otherwise)
_SSTART  (The smaller of SSTART and SEND)
_SEND    (The larger of SSTART and SEND)

Note: there is no _QSTART/_QEND. We assume QSTART < QEND.

Attributes can be accessed in the form h.QSEQID or h.['QSEQID']; they may only
be modified in the latter form.
'''

# As a rule, objects whose name begins with an underscore (e.g. the function
# _dist) are intended for use only in this module, and not intended to be seen
# by users.

import os as _os, itertools as _it, os.path as _path, argparse as _arg
import functools as _func, contextlib as _cont, re as _re, customcsv as _csv
import fasta, utils
from operator import attrgetter as _attrget, itemgetter as _itemget
try: from cStringIO import StringIO as _sIO
except ImportError: from StringIO import StringIO as _sIO

# This is, bar none, *the most important function in the module*, due to
# which prominence it is given first. It refers to several functions (e.g.
# stratify(), classifyrecords()) that are defined later.
def full_transposon_treatment(seq,overlap,gap,minlength,fastaout,evalue=None,
                              fname=None):
    '''This is where it all comes together. This takes a sequence of
    hits, assumed to constitute an entire a blast search between one
    transposon and one fly genome. (See note below.)  It performs the
    main process of this module -- i.e., creating the input for a
    multiple-alignment -- and dumps that information in FASTA format
    to *fastaout*, which must be a writeable fasta object (see module
    *fasta*). The user is naturally responsible for closing both, if
    appropriate (as it is in almost all cases).

    NOTE: Generally it is best to have *seq* come from the function
    hitsfromcsv(). This can be done implicitly by giving None as the
    first argument, in which case *f* is expected to be a file object
    or filename to be given to hitstocsv().
    '''
    if None not in (seq,fname):
          raise Error("Cannot give both seq and fname arguments")
    elif seq is None: seq = hitsfromcsv(fname)
    for s,hits in utils.groupby(seq,key=_attrget('SSEQID')).iteritems():
       for island in makeislands(hits,gap):
          singles,nests = classifyrecords(island,overlap)
          nests = [stratify(N,minlength) for N in nests]
          if singles or any(nests):
            fastaout.writeentries(resolve_query_overlap(singles,nests,overlap))
          else: raise Error('No records result from file {!r}'.format(fname))

_intflds = ('QSTART','QEND','SSTART','SEND')
_txtflds = ('QSEQID','SSEQID','SSEQ')
_fltflds = ('EVALUE',)
allflds = tuple(_it.chain(_txtflds,_intflds,_fltflds))
_intflds,_txtflds,_fltflds = map(frozenset,(_intflds,_txtflds,_fltflds))
def hitsfromcsv(f_obj,intflds=(),fltflds=(),txtflds=(),evalue=None,**kwds):
    """Uses parseHeaderedCSV (see the module customcsv.py) to read a CSV
    file containing blast hits. The {int,flt,txt}flds options all behave
    as in that function, except they are augmented with the fields listed
    in *_{int,flt,txt}flds* respectively; as a result, if the CSV file
    is missing any of those fields (conveniently aggregated in
    *allflds*), this function will raise a csv.Error.

    There is support for evalue-based filtering; if the *evalue* argument
    is present, this function will yield only hits whose evalue is less.

    setlength() is applied to each hit; each hit also receives an
    orientation, according to whether its SSTART/SEND numbers are in order
    or reversed, and _SSTART/_SEND values representing the subject ordinates
    in order. See top-level module documentation.
    """
    for h in _csv.parseHeaderedCSV(f_obj,intflds=_intflds|set(intflds),
                                   fltflds=_fltflds|set(fltflds),
                                   txtflds=_txtflds|set(txtflds),**kwds):
      if evalue is not None and h.EVALUE >= evalue: continue
      h['ORIENTED'],h['_SSTART'],h['_SEND'] = \
        (True,h.SSTART,h.SEND) if h.SSTART<=h.SEND else (False,h.SEND,h.SSTART)
      setlength(h); yield h

def makeislands(seq,gaplength):
    L = list(utils.components(seq,rel=lambda x,y: s_distance(x,y)<=gaplength))
    L.sort(key=lambda L: L[0].SSTART)
    for i,island in enumerate(L,1):
        suff = '_{}'.format(i)
        for hit in island: hit['SSEQID'] += suff
    return L

def classifyrecords(seq,overlap):
    '''Takes a sequence of blast hits; picks out as 'nests' sequences of
    adjacent hits that overlap with a neighbor. Returns a pair of lists:
    the first contains records that were not part of a nest, and the second
    contains nests, i.e.  lists of the resulting 'extracted' hits from that
    nest.

    This function does not stratify those nests - for use in the final
    algorithm, they must go through the function stratify().
    '''
    sings,nests = utils.bifilter(
                     utils.components(seq,lambda x,y: s_overlap(x,y)>=overlap),
                     key=lambda x: len(x)==1)
    return (map(_itemget(0),sings),nests)

def stratify(nest,minlength):
    '''Generator function that takes a nest, i.e. a list of hits for which
    each adjacent pair has nontrivial overlap in the subject ordinates,
    and yields according to the following process:
    - Yield x, the hit with lowest EVALUE (largest length is tiebreaker)
    - For each other hit y, truncate (or if necessary, split) y so as to
      remove any overlap with x. If this makes y a trivial hit, i.e. one
      whose length is less than minlength, remove it from the nest entirely.
    - Repeat until the nest has been exhausted.

    The process is actually implemented in an abstract fashion using a
    helper function - see _stratify().
    '''
    return _stratify(nest,
        rank=hit_rank,
        filterfunc=lambda x: x.LENGTH > minlength,
        sget=_attrget('_SSTART'),
        eget=_attrget('_SEND'),
        sset=set__SSTART,
        eset=set__SEND)

def resolve_query_overlap(standalones,nests,overlap):
    '''Expects a list of standalone fragments and a list of nests. Nests are
    expected to have undergone the subject overlap truncation scheme (see
    *stratify*). The return value is a list of fasta entries. If the function
    detects no query overlap between any pair of fragments -- including those
    in nests -- the fragments are "assembled" (non-technical term) in order
    of query-ordinates into a single fasta entry, which is the only element of
    the returned list.
    '''
    standalones = list(standalones)
    nests = map(list,nests)
    if not (standalones or any(nests)):
      raise Error('Tried to resolve query overlap on an empty set of records!')

    # assign names before reordering
    for j,hit in enumerate(standalones): setname(hit,'standalone[{}]'.format(j))
    for i,nest in enumerate(nests,1):
        for j,hit in enumerate(nest): setname(hit,'nest{}[{}]'.format(i,j))

    # and then reorder by query ordinate
    recs = sorted(_it.chain(standalones,*nests),key=_attrget('QSTART'))

    if any(q_overlap(x,y)>=overlap for x,y in _it.izip(recs,recs[1:]))\
        or len(recs)==1: return _it.imap(make_entry,recs)
    prev = None
    with _cont.closing(_sIO()) as seq:
        for hit in recs:
            seq.write('-'*(hit.QSTART-1-( prev and prev.QEND or 0 )))
            if prev is None: st,end = hit._SSTART,hit._SEND
            else: st,end = min(st,hit._SSTART),max(end,hit._SEND);
            seq.write(hit.SSEQ)
            prev = hit
        result = fasta.seq_entry({'SEQ': seq.getvalue(),
         'NAME': _name_fmt.format(_GRP='all',SSEQID=recs[0].SSEQID,
         SSTART=min(h._SSTART for h in recs),SEND=max(h._SEND for h in recs))})
    return [result]

class Error(Exception):
    """Special exception class thrown by functions in this module."""
    def __init__(self,*args): Exception.__init__(self,*args)
    def __hash__(self): return hash(tuple(self.items()))

def setlength(h):
    '''*h* is a hit; this sets its length by the formula abs(SEND-SSTART)+1.'''
    h['LENGTH'] = abs(h.SEND - h.SSTART) + 1

#This is a template for distances based on start/end functions "st"/"end".
def _dist(x,y,st,end): return max(0,max(st(x),st(y))-min(end(x),end(y)))
def s_distance(x,y):
    '''Measures distance between the subject ordinates of x and y. Returns 1
    if they are adjacent, 0 if they overlap.'''
    return _dist(x,y,_attrget('_SSTART'),_attrget('_SEND'))
def s_distance(x,y):
    '''Measures distance between the query ordinates of x and y. Returns 1
    if they are adjacent, 0 if they overlap.'''
    return _dist(x,y,_attrget('QSTART'),_attrget('QEND'))

def _overlap(x,y,st,end): return max(0,1+min(end(x),end(y))-max(st(x),st(y)))
def q_overlap(x,y):
    '''Returns the number of base pairs by which the query ordinates of x
    and y overlap. Returns zero if and only if they are disjoint.'''
    return _overlap(x,y,_attrget('QSTART'),_attrget('QEND'))
def s_overlap(x,y):
    '''Returns the number of base pairs by which the subject ordinates of x
    and y overlap. Returns zero if and only if they are disjoint.'''
    return _overlap(x,y,_attrget('_SSTART'),_attrget('_SEND'))

fmtstr = 'Can remove at most %d chars from given string (%d given)'
def extend(s,n):
    '''Stub function, included for debugging purposes: raises an error
    that the extending action attempted is not supported. This is made
    available to set__SSTART() and set__SEND() to ensure they only
    shorten, rather than extend, the hits they adjust: for if we want
    to extend a hit, how do we know what to add to the sequence?
    '''
    raise Error('extending a hit is not supported')
def shortenhead(s,n):
    '''This function shortens the given sequence string by *n* base pairs
    (not including gaps) at the beginning. Used by set__SSTART(), set__SEND().
    '''
    start = 0
    while start < len(s) and s[start] == '-': start += 1
    for i in xrange(n):
        if start >= len(s): raise Error(fmtstr % (i,n))
        start += 1
        while start < len(s) and s[start] == '-': start += 1
    return s[start:]
def shortentail(s,n):
    '''This function shortens the given sequence string by *n* base pairs
    (not including gaps) at the end. Used by set__SSTART(), set__SEND().
    '''
    end = len(s)-1
    while end >= 0 and s[end] == '-': end -= 1
    for i in xrange(n):
        if end == -1: raise Error(fmtstr%(i,n))
        end -= 1
        while end >= 0 and s[end] == '-': end -= 1
    return s[:end+1]
_actions = (shortenhead, extend, shortentail, extend)
def set__SSTART(hit,value):
    '''Sets _SSTART to the given value, and adjusts other attributes (query and
    ordinates, sequence string, length, etc) accordingly.'''
    if value >= hit._SEND: return None
    if value == hit._SSTART: return hit
    hit,change,nori = hit.copy(),value - hit._SSTART,not hit.ORIENTED
    p = 'START' if hit.ORIENTED else 'END'
    hit['Q'+p] += change*(-1)**nori ; hit['_SSTART'] = hit['S'+p] = value
    hit['SSEQ'] = _actions[2*nori + (change<0)](hit.SSEQ,abs(change))
    setlength(hit) ; return hit
def set__SEND(hit,value):
    '''Sets _SEND to the given value, and adjusts other attributes (query and
    ordinates, sequence string, length, etc) accordingly.'''
    if value <= hit._SSTART: return None
    if value == hit._SEND: return hit
    hit,change = hit.copy(),value - hit._SEND
    p = 'END' if hit.ORIENTED else 'START'
    hit['Q'+p] -= change*(-1)**hit.ORIENTED ; hit['S'+p] = hit['_SEND'] = value
    hit['SSEQ'] = _actions[2*hit.ORIENTED + (change>0)](hit.SSEQ,abs(change))
    setlength(hit) ; return hit

def _stratify(nest,rank,filterfunc,sget,sset,eget,eset):
    '''This function implements the "subject overlap truncation scheme" for
    nests. Specifically, it pops the "best" element of the nest (according
    to the argument *rank*), truncates the rest to have no overlap with it
    (according to the argument *sget*), and filters the nest (using the
    argument *filterfunc*). The process repeats until the nest is exhausted.

    This is to be understood as an abstract version of the "stratify"
    function, with particulars represented abstractly to aid understanding
    and readability. In particular, all arguments except for *nest* are
    functions.'''
    def s_hit(main,other):
        '''Strips off the sequence part of the *main* hit from the *other*,
        and yields two hits, one, or none depending on whether or how they
        overlap.'''
        [[s,s_],[e,e_]] = [[f(x) for x in (main,other)] for f in (sget,eget)]
        if s_ < s-1: yield eset(other,min(e_,s-1))
        if e+1 < e_: yield sset(other,max(e+1,s_))
    nest = [(x,rank(x)) for x in nest if filterfunc(x)]
    while nest:
       (h,r),i = utils.popmax(nest,key=_itemget(1)),0 ; yield h
       while i < len(nest):
          results = [(x,rank(x)) for x in s_hit(h,nest[i][0]) if filterfunc(x)]
          nest[i:i+1] = results
          i += len(results)

def hit_rank(hit): return -hit.EVALUE,hit.LENGTH

_fillchar = '-'

_name_fmt = '{SSEQID}_{SSTART}-{SEND}_{_GRP}'
def setname(hit,grp):
    '''Gives a default 'NAME' attribute to this hit, if it doesn't already
    have one. It follows the format above, where the _GRP is given by argument
    *grp*.'''
    hit.setdefault('NAME',_name_fmt.format(_GRP=grp,**hit)) ; hit.show('NAME')

def make_entry(hit,padded=True):
    '''Turns a record into a fasta entry. The 'SEQ' field is set to a default,
     if not already present: 'NAME' reflects subject source
    information about the hit as well as the specified group, while 'SEQ' is
    set to 'SSEQ', padded on the left with *fillchar* (default '-') to fit the
    query location unless *padded* is set to False.
    '''
    hit = hit.copy() ; hit.open()
    hit.setdefault('SEQ',_fillchar*(hit.QSTART-1)*padded + hit.pop('SSEQ'))
    return fasta.seq_entry(hit)