Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 11 additions & 2 deletions repository/BaselineOfBioSmalltalk/BaselineOfBioSmalltalk.class.st
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,8 @@ BaselineOfBioSmalltalk >> baseline: spec [
projectSpecForPolyMath: spec;
projectSpecForDataFrame: spec;
projectSpecForINIParser: spec;
projectSpecForPetitParser: spec.
projectSpecForPetitParser: spec;
projectSpecForNeedlemanWunsch: spec.
"projectSpecForRoassal2: spec."

self baselineOSDeps: spec.
Expand Down Expand Up @@ -109,7 +110,7 @@ BaselineOfBioSmalltalk >> baselineCommonPackages: spec [
package: 'BioSoftwareCatalog' with: [ spec requires: #('BioTools' ). ];
package: 'BioSupport' with: [ spec requires: #('BioTools' 'DataFrame'). ];
package: 'BioTools' with: [ spec
requires: #('CommonUtils' 'StringExtensions' 'PetitParser2Core' 'PolyMath' 'XMLParser');
requires: #('CommonUtils' 'StringExtensions' 'PetitParser2Core' 'PolyMath' 'XMLParser' 'NeedlemanWunsch');
includes: #('BioPharoCommon') ];
package: 'BioToolsSamples' with: [ spec requires: #('BioTools' 'BioEntrez' 'BioParsers' ). ];
package: 'BioTools-Tests' with: [ spec requires: #('BioTools' ). ];
Expand Down Expand Up @@ -333,6 +334,14 @@ BaselineOfBioSmalltalk >> projectSpecForINIParser: spec [

]

{ #category : 'specs' }
BaselineOfBioSmalltalk >> projectSpecForNeedlemanWunsch: spec [

spec
baseline: 'NeedlemanWunsch'
with: [ spec repository: 'github://hernanmd/needleman-wunsch/src' ]
]

{ #category : 'specs' }
BaselineOfBioSmalltalk >> projectSpecForOsSubprocess: spec [

Expand Down
4 changes: 2 additions & 2 deletions repository/BioTools-Tests/BioAlignmentTest.class.st
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@ Class {
#instVars : [
'align'
],
#category : 'BioTools-Tests-Core',
#category : 'BioTools-Tests-Align',
#package : 'BioTools-Tests',
#tag : 'Core'
#tag : 'Align'
}

{ #category : 'accessing' }
Expand Down
6 changes: 3 additions & 3 deletions repository/BioTools-Tests/BioAlphabetTest.class.st
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
Class {
#name : 'BioAlphabetTest',
#superclass : 'BioAbstractTest',
#category : 'BioTools-Tests-Alphabets',
#package : 'BioTools-Tests',
#tag : 'Alphabets'
#instVars : [
'alphabet'
],
#category : 'BioTools-Tests-Biological',
#package : 'BioTools-Tests',
#tag : 'Biological'
}

{ #category : 'testing' }
Expand Down
4 changes: 2 additions & 2 deletions repository/BioTools-Tests/BioCodonTableTest.class.st
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@ Class {
#instVars : [
'table'
],
#category : 'BioTools-Tests-Biological',
#category : 'BioTools-Tests-Alphabets',
#package : 'BioTools-Tests',
#tag : 'Biological'
#tag : 'Alphabets'
}

{ #category : 'running' }
Expand Down
117 changes: 117 additions & 0 deletions repository/BioTools-Tests/BioHirschbergTest.class.st
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
Class {
#name : 'BioHirschbergTest',
#superclass : 'TestCase',
#category : 'BioTools-Tests-Align',
#package : 'BioTools-Tests',
#tag : 'Align'
}

{ #category : 'tests' }
BioHirschbergTest >> testAlignEmptyWithEmpty [
| result |
result := BioHirschberg new align: '' with: ''.
self assert: result firstSequence equals: ''.
self assert: result secondSequence equals: ''.
]

{ #category : 'tests' }
BioHirschbergTest >> testAlignEmptyWithNonEmpty [
| result |
result := BioHirschberg new align: '' with: 'GATTACA'.
self assert: result firstSequence equals: '-------'.
self assert: result secondSequence equals: 'GATTACA'.
]

{ #category : 'tests' }
BioHirschbergTest >> testAlignIdenticalSequences [
| result |
result := BioHirschberg new align: 'ACGTACGT' with: 'ACGTACGT'.
self assert: result firstSequence equals: 'ACGTACGT'.
self assert: result secondSequence equals: 'ACGTACGT'.
]

{ #category : 'tests' }
BioHirschbergTest >> testAlignNonEmptyWithEmpty [
| result |
result := BioHirschberg new align: 'GATTACA' with: ''.
self assert: result firstSequence equals: 'GATTACA'.
self assert: result secondSequence equals: '-------'.
]

{ #category : 'tests' }
BioHirschbergTest >> testAlignSingleCharacterMismatch [

| result |
result := BioHirschberg new align: 'A' with: 'T'.
self assert: result firstSequence size equals: 1.
self assert: result secondSequence size equals: 1.
self assert: result firstSequence equals: 'A'.
self assert: result secondSequence equals: 'T'
]

{ #category : 'tests' }
BioHirschbergTest >> testBaseCaseManyAgainstOne [

| result |
result := BioHirschberg new align: 'GATTACA' with: 'A'.
self
assert: result firstSequence size
equals: result secondSequence size.
self assert: (result secondSequence occurrencesOf: $A) equals: 1
]

{ #category : 'tests' }
BioHirschbergTest >> testBaseCaseOneAgainstMany [

| result |
result := BioHirschberg new align: 'A' with: 'GATTACA'.
self
assert: result firstSequence size
equals: result secondSequence size.
self assert: (result firstSequence occurrencesOf: $A) equals: 1
]

{ #category : 'tests' }
BioHirschbergTest >> testOnlyGapCharacterInsertedForPadding [

| result allowed |
result := BioHirschberg new align: 'GAT' with: 'GATTACA'.
allowed := (Set withAll: 'GATTACA') , (Set with: $-).
self assert:
(result firstSequence allSatisfy: [ :ch | allowed includes: ch ]).
self assert:
(result secondSequence allSatisfy: [ :ch | allowed includes: ch ])
]

{ #category : 'tests' }
BioHirschbergTest >> testResultSequencesHaveSameLength [

| result |
result := BioHirschberg new align: 'GATTACA' with: 'GCATGCU'.
self
assert: result firstSequence size
equals: result secondSequence size
]

{ #category : 'tests' }
BioHirschbergTest >> testSequencesWithGaps [

| result |
result := BioHirschberg new align: 'GATTACA' with: 'GAATACA'.
self assert: (result firstSequence includesSubstring: '-').
self assert: (result secondSequence includesSubstring: '-')
]

{ #category : 'tests' }
BioHirschbergTest >> testSequencesWithRepeatedGaps [

| result |
result := BioHirschberg new align: 'G--A' with: 'GA--'.

"Alignments should be of equal length"
self
assert: result firstSequence size
equals: result secondSequence size.
"Alignment contains gaps as expected"
self assert: (result firstSequence includesSubstring: '-')
]
10 changes: 6 additions & 4 deletions repository/BioTools-Tests/BioUnknownSeqTest.class.st
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,13 @@ BioUnknownSeqTest >> setUp [
{ #category : 'testing' }
BioUnknownSeqTest >> testAlphabet [

self assert: ( self sequence alphabet isKindOf: BioAlphabet ).
self assert: ( self sequence unknownLetter = BioAlphabet new defaultUnknownElement ).
self assert: (self sequence alphabet isKindOf: BioAlphabet).
self
assert: self sequence unknownLetter
equals: BioAlphabet new defaultUnknownElement.
self sequence unknownLetter: $?.
self assert: ( self sequence alphabet isKindOf: BioAlphabet ).
self assert: self sequence unknownLetter = $?
self assert: (self sequence alphabet isKindOf: BioAlphabet).
self assert: self sequence unknownLetter equals: $?
]

{ #category : 'testing' }
Expand Down
4 changes: 2 additions & 2 deletions repository/BioTools/BioAlignment.class.st
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,9 @@ Class {
'consensusStrategy',
'isReference'
],
#category : 'BioTools-Core',
#category : 'BioTools-Align',
#package : 'BioTools',
#tag : 'Core'
#tag : 'Align'
}

{ #category : 'accessing' }
Expand Down
146 changes: 146 additions & 0 deletions repository/BioTools/BioHirschberg.class.st
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
"
Hirschberg's algorithm is a more space-efficient modification of the Needleman-Wunsch algorithm, designed for optimal sequence alignment. It uses a divide-and-conquer approach to achieve linear space complexity while maintaining the same time complexity as Needleman-Wunsch.

This implementation reduces space complexity from O(nm) to O(min(n,m)) while maintaining O(nm) time complexity.
"
Class {
#name : 'BioHirschberg',
#superclass : 'ALNeedlemanWunsch',
#category : 'BioTools-Align',
#package : 'BioTools',
#tag : 'Align'
}

{ #category : 'initialization' }
BioHirschberg >> align: sequence1 with: sequence2 [

| aligned |
self
firstSequence: sequence1;
secondSequence: sequence2.

(sequence1 isEmpty and: [ sequence2 isEmpty ]) ifTrue: [ ^ self ].

aligned := self recursiveAlignment: sequence1 with: sequence2.
self finalize: aligned first reversed with: aligned second reversed.
^ self
]

{ #category : 'accessing' }
BioHirschberg >> alignmentScoreOf: alignedA with: alignedB using: aligner [

| score |
score := 0.
1 to: alignedA size do: [ :i |
score := score
+
(aligner
matchScore: (alignedA at: i)
with: (alignedB at: i)) ].
^ score
]

{ #category : 'transforming' }
BioHirschberg >> backwardPass: aSeq with: bSeq [
"R[j] = score of aligning reverse(aSeq) with reverse(bSeq[1..j]) mapped back"

^ (self forwardPass: aSeq reversed with: bSeq reversed) reversed
]

{ #category : 'transforming' }
BioHirschberg >> forwardPass: aSeq with: bSeq [

| current previous diagonal up left |
current := Array new: bSeq size + 1.
0 to: bSeq size do: [ :j |
current at: j + 1 put: j * self gapPenalty ].

aSeq doWithIndex: [ :charA :i |
previous := current copy.
current at: 1 put: i * self gapPenalty.
1 to: bSeq size do: [ :j |
diagonal := (previous at: j)
+ (self matchScore: charA with: (bSeq at: j)).
up := (previous at: j + 1) + self gapPenalty.
left := (current at: j) + self gapPenalty.
current at: j + 1 put: ((diagonal max: up) max: left) ] ].
^ current
]

{ #category : 'initialization' }
BioHirschberg >> initialize [

super initialize.
gapCharacter := $-
]

{ #category : 'transforming' }
BioHirschberg >> needlemanWunschForSmallCases: aSeq with: bSeq [

| nw |
nw := self class superclass new.
nw
matchAward: self matchAward;
mismatchPenalty: self mismatchPenalty;
gapPenalty: self gapPenalty;
gapCharacter: self gapCharacter;
align: aSeq with: bSeq.
^ {
nw firstSequence.
nw secondSequence }
]

{ #category : 'transforming' }
BioHirschberg >> recursiveAlignment: aSeq with: bSeq [

| n m mid leftA rightA scoreL scoreR split leftPart rightPart |
n := aSeq size.
m := bSeq size.

n = 0 ifTrue: [
^ {
(String new: m withAll: self gapCharacter).
bSeq } ].

m = 0 ifTrue: [
^ {
aSeq.
(String new: n withAll: self gapCharacter) } ].

(n = 1 or: [ m = 1 ]) ifTrue: [
^ self needlemanWunschForSmallCases: aSeq with: bSeq ].

mid := n // 2.
leftA := aSeq copyFrom: 1 to: mid.
rightA := aSeq copyFrom: mid + 1 to: n.

scoreL := self forwardPass: leftA with: bSeq.
scoreR := self backwardPass: rightA with: bSeq.
split := self splitIndexFromForward: scoreL backward: scoreR.

leftPart := self
recursiveAlignment: leftA
with: (bSeq copyFrom: 1 to: split).
rightPart := self
recursiveAlignment: rightA
with: (bSeq copyFrom: split + 1 to: m).

^ {
(leftPart first , rightPart first).
(leftPart second , rightPart second) }
]

{ #category : 'transforming' }
BioHirschberg >> splitIndexFromForward: scoreL backward: scoreR [

| m bestIndex bestScore candidate |
m := scoreL size - 1.
bestIndex := 0.
bestScore := SmallInteger minVal.
0 to: m do: [ :j |
candidate := (scoreL at: j + 1) + (scoreR at: m - j + 1).
candidate > bestScore ifTrue: [
bestScore := candidate.
bestIndex := j ] ].
^ bestIndex
]