Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
defeb5f
Adds python dependencies in standard requirements format
hlapp May 15, 2024
3349eed
Merges changes from upstream (bmajoros/BlueSTARR)
hlapp May 21, 2024
8694839
Merge changes from upstream (bmajoros/BlueSTARR)
hlapp Jun 25, 2024
80fbe33
Adds conda environment definition
hlapp Jun 27, 2024
dbf4340
Add dockerfile for building BlueSTARR images (#4)
tncowart Aug 28, 2024
84bf59d
Adds GitHub Action workflow to automatically build Docker image
hlapp Sep 3, 2024
3d9d75c
Adds extra index URL directly into requirements
hlapp Nov 1, 2024
4e91b76
Adds environment setup instructions to README
hlapp Nov 1, 2024
4a45bbd
Adds scripts to fix TensorRT library failing to be found
hlapp Nov 1, 2024
0767303
Adds checking whether link target exists
hlapp Nov 2, 2024
e8541af
Adds documentation for fixing TensorRT not found
hlapp Nov 2, 2024
9e3194d
BlueSTARR version used for v0.1.1 model training
yutiachen5 Jan 16, 2025
b803208
Clarifies instructions for building conda environment
hlapp May 13, 2025
fa225fe
Adds sequence index to true-vs-predicted table
yutiachen5 Jul 6, 2025
6082e3e
Allows loading pretrained weights prior to training
hlapp Jul 5, 2025
0efced4
Adds making dropout after flatten optional
hlapp Sep 2, 2025
afbb5d8
Adds clean logic for calling as a program
hlapp Sep 2, 2025
12f3211
Merge pull request #14 from Duke-IGVF/preDense-dropout
hlapp Sep 2, 2025
ba96d8d
Adds library size normalization for custom loss (#15)
hlapp Sep 8, 2025
dcf4ad2
Fixed library size ratio calculation
hlapp Sep 30, 2025
e57f7a7
Updates transformer-based model architecture
hlapp Dec 3, 2025
9a8239a
Expands instructions for running BlueSTARR code
hlapp Mar 23, 2026
83df3ce
Adds initial section on data processing from STARR-seq
hlapp Mar 23, 2026
3647860
Adds documentation and links for downsampling
hlapp Mar 24, 2026
d56610e
Notebook for data preprocessing
RevathyVenukuttan Mar 25, 2026
edce05c
Update README.md
RevathyVenukuttan Mar 25, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 58 additions & 0 deletions .github/workflows/build-deploy.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
name: Create and publish a Docker image

on:
push:
branches: main
paths-ignore:
- 'LICENSE*'
- 'README.md'
- 'requirements.txt'
- '*.yml'
release:
types: [published]

env:
REGISTRY: ghcr.io
IMAGE_NAME: ${{ github.repository }}

jobs:
build-and-push-image:
runs-on: ubuntu-latest
permissions:
contents: read
packages: write

steps:
- name: Checkout repository
uses: actions/checkout@v4

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3

- name: Log in to the Container registry
uses: docker/login-action@v3
with:
registry: ${{ env.REGISTRY }}
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}

- name: Extract metadata (tags, labels) for Docker
id: meta
uses: docker/metadata-action@v5
with:
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
tags: |
type=raw,value=latest,enable={{is_default_branch}}
type=semver,pattern={{major}}.{{minor}}.{{patch}}
type=semver,pattern={{major}}.{{minor}}
type=semver,pattern={{major}}

- name: Build and push Python-only Docker image
uses: docker/build-push-action@v6
with:
context: .
push: true
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
cache-to: type=gha,mode=max
cache-from: type=gha
123 changes: 77 additions & 46 deletions BlueSTARR-multitask.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from keras.layers import Dropout, Reshape, Dense, Activation, Flatten
from keras.layers import BatchNormalization, InputLayer, Input, LSTM, GRU, Bidirectional, Add, Concatenate, LayerNormalization, MultiHeadAttention
import keras_nlp
from keras_nlp.layers import SinePositionEncoding
from keras_nlp.layers import SinePositionEncoding, TransformerEncoder, RotaryEmbedding
from keras import models
from keras.models import Sequential, Model
from keras.optimizers import Adam
Expand Down Expand Up @@ -51,7 +51,7 @@
#=========================================================================
# main()
#=========================================================================
def main(configFile,subdir,modelFilestem):
def main(configFile,subdir,modelFilestem, pretrained=None):
startTime=time.time()
#random.seed(RANDOM_SEED)

Expand All @@ -62,17 +62,20 @@ def main(configFile,subdir,modelFilestem):
# Load data
print("loading data",flush=True)
shouldRevComp=config.RevComp==1
(X_train_sequence, X_train_seq_matrix, X_train, Y_train) = \
(X_train_sequence, X_train_seq_matrix, X_train, Y_train, idx_train) = \
prepare_input("train",subdir,shouldRevComp,config.MaxTrain,config)
(X_valid_sequence, X_valid_seq_matrix, X_valid, Y_valid) = \
(X_valid_sequence, X_valid_seq_matrix, X_valid, Y_valid, idx_val) = \
prepare_input("validation",subdir,shouldRevComp,config.MaxTrain,config)
(X_test_sequence, X_test_seq_matrix, X_test, Y_test) = \
(X_test_sequence, X_test_seq_matrix, X_test, Y_test, idx_test) = \
prepare_input("test",subdir,shouldRevComp,config.MaxTest,config) \
if(config.ShouldTest!=0) else (None, None, None, None)
seqlen=X_train.shape[1]

# Build model
model=BuildModel(seqlen)
if pretrained:
print("Loading pretrained model weights from",pretrained,flush=True)
model.load_weights(pretrained)
model.summary()

# Train
Expand All @@ -96,7 +99,8 @@ def main(configFile,subdir,modelFilestem):
numTasks=len(config.Tasks)
for i in range(numTasks):
summary_statistics(X_test,Y_test,"Test",i,numTasks,
config.Tasks[i],model)
config.Tasks[i],model,idx_test,modelFilestem)
print('Min validation loss:', round(min(history.history['val_loss']), 4))

# Report elapsed time
endTime=time.time()
Expand All @@ -106,10 +110,21 @@ def main(configFile,subdir,modelFilestem):



def summary_statistics(X, Y, set, taskNum, numTasks, taskName, model):
def summary_statistics(X, Y, set, taskNum, numTasks, taskName, model, idx, modelFilestem):
pred = model.predict(X, batch_size=config.BatchSize)
cor=naiveCorrelation(Y,pred,taskNum,numTasks)
if (config.useCustomLoss) :
naiveTheta, cor=naiveCorrelation(Y,pred,taskNum,numTasks) # naiveTheta: normal scale, pred: log scale
df = pd.DataFrame({'idx':idx, 'true':tf.math.log(naiveTheta),'predicted': pred.squeeze()}) # log scale
mse = np.mean((df['true'] - df['predicted'])**2)
df.to_csv(modelFilestem+'.txt', index = False, sep='\t')
else:
cor=stats.spearmanr(tf.math.exp(pred.squeeze()),tf.math.exp(Y))
df = pd.DataFrame({'idx':idx, 'true':Y.numpy().ravel(),'predicted': pred.squeeze()}) # log scale
mse = np.mean((df['true'] - df['predicted'])**2)
df.to_csv(modelFilestem+'.txt', index = False, sep='\t')

print(taskName+" rho=",cor.statistic,"p=",cor.pvalue)
print(taskName+' mse=', mse)



Expand All @@ -118,19 +133,19 @@ def naiveCorrelation(y_true, y_pred, taskNum, numTasks):
for i in range(taskNum): a+=NUM_DNA[i]+NUM_RNA[i]
b=a+NUM_DNA[taskNum]
c=b+NUM_RNA[taskNum]
DNA=y_true[:,a:b]+1
RNA=y_true[:,b:c]+1
sumX=tf.reduce_sum(DNA,axis=1)
sumY=tf.reduce_sum(RNA,axis=1)
naiveTheta=sumY/sumX
DNA=y_true[:,a:b] #+1
RNA=y_true[:,b:c] #+1
avgX = tf.reduce_mean(DNA, axis=1)
avgY = tf.reduce_mean(RNA, axis=1)
naiveTheta = avgY / avgX
#print("naiveTheta=",naiveTheta)
#print("y_pred=",tf.math.exp(y_pred[taskNum].squeeze()))
cor=None
if(numTasks==1):
cor=stats.spearmanr(tf.math.exp(y_pred.squeeze()),naiveTheta)
else:
cor=stats.spearmanr(tf.math.exp(y_pred[taskNum].squeeze()),naiveTheta)
return cor
return naiveTheta, cor

#def Spearman(y_true, y_pred):
# return ( tf.py_function(spearmanr, [tf.cast(y_pred, tf.float32),
Expand All @@ -148,27 +163,40 @@ def log(x):
def logGam(x):
return tf.math.lgamma(x)

def logLik(sumX,numX,Yj,logTheta,alpha,beta,numRNA):
def logLik(sumX,numX,Yj,logTheta,alpha,beta,numRNA,sumDnaLibs=None,RnaLibs=None):
n=tf.shape(sumX)[0]
sumX=tf.tile(tf.reshape(sumX,[n,1]),[1,numRNA])
theta=tf.math.exp(logTheta) # assume the model is predicting log(theta)
if not (sumDnaLibs is None or RnaLibs is None):
n=tf.shape(sumDnaLibs)[0]
sumDnaLibs=tf.tile(tf.reshape(sumDnaLibs,[n,1]),[1,numRNA])
libRatio=RnaLibs/(sumDnaLibs/numX)
theta=theta*libRatio
LL=(sumX+alpha)*log(beta+numX)+logGam(Yj+sumX+alpha)+Yj*log(theta)\
-logGam(sumX+alpha)-logGam(Yj+1)-(Yj+sumX+alpha)*log(theta+beta+numX)
return tf.reduce_sum(LL,axis=1) # sum logLik across iid replicates

@tf.autograph.experimental.do_not_convert
def makeClosure(taskNum):
a=0
for i in range(taskNum): a+=NUM_DNA[i]+NUM_RNA[i]
b=a+NUM_DNA[taskNum]
c=b+NUM_RNA[taskNum]
for i in range(taskNum): a+=NUM_DNA[i]+NUM_RNA[i] # skip previous tasks
b=a+NUM_DNA[taskNum] # first column of RNA counts
c=b+NUM_RNA[taskNum] # first column of DNA lib sizes
d=c+NUM_DNA[taskNum] # first column of RNA lib sizes
@tf.autograph.experimental.do_not_convert
def loss(y_true, y_pred):
global EPSILON
DNA=y_true[:,a:b]
RNA=y_true[:,b:c]
sumX=tf.reduce_sum(DNA,axis=1)
LL=-logLik(sumX,b-a,RNA,y_pred,EPSILON,EPSILON,NUM_RNA[taskNum])
if (tf.shape(y_true)[1] > c):
DnaLibs=y_true[:,c:d]
RnaLibs=y_true[:,d:]
sumDnaLibs=tf.reduce_sum(DnaLibs,axis=1)
LL=-logLik(sumX,b-a,RNA,y_pred,EPSILON,EPSILON,NUM_RNA[taskNum],
sumDnaLibs,RnaLibs)
else:
LL=-logLik(sumX,b-a,RNA,y_pred,EPSILON,EPSILON,NUM_RNA[taskNum])
#return tf.reduce_mean(LL,axis=-1) # Put on same scale as MSE
#print("pred=",y_pred)
#print("-LL=",LL)
Expand All @@ -191,11 +219,6 @@ def loss(y_true, y_pred):
naiveTheta=sumY/sumX
mse=tf.math.reduce_mean(tf.math.square(y_pred-tf.math.log(naiveTheta)),
axis=1)
#mse=tf.math.reduce_mean(tf.math.square(tf.math.exp(y_pred)-naiveTheta))
### axis=-1)
print("log(naiveTheta)=",tf.math.log(naiveTheta))
print("pred=",y_pred)
print("mse=",mse)
return mse
#cor=stats.spearmanr(tf.math.exp(y_pred[taskNum].squeeze()),naiveTheta)
#return cor
Expand Down Expand Up @@ -251,6 +274,7 @@ def loadFasta(fasta_path, as_dict=False,uppercase=False, stop_at=None,
rc=generate_complementary_sequence(rec[1])
rec[1]=rec[1]+"NNNNNNNNNNNNNNNNNNNN"+rc
return pd.DataFrame({'location': [e[0] for e in fastas],
'idx': [e[0].split(' ')[0] for e in fastas],
'sequence': [e[1] for e in fastas]})


Expand All @@ -268,9 +292,11 @@ def loadCounts(filename,maxCases,config):
for line in IN:
if type(line) is bytes: line = line.decode("utf-8")
fields=line.rstrip().split()
fields=[int(x) for x in fields]
if(config.useCustomLoss): lines.append(fields)
else: lines.append(computeNaiveTheta(fields,DNAreps,RNAreps))
if(config.useCustomLoss):
lines.append([int(x) for x in fields])
else:
fields=[float(x) for x in fields] # possibly normalized data
lines.append(computeNaiveTheta(fields,DNAreps,RNAreps))
linesRead+=1
if(linesRead>=maxCases): break
lines=np.array(lines)
Expand All @@ -283,12 +309,12 @@ def computeNaiveTheta(line,DNAreps,RNAreps):
for i in range(numTasks):
b=a+DNAreps[i]
c=b+RNAreps[i]
DNA=line[a:b] #+1
RNA=line[b:c] #+1
sumX=sum(DNA)+1
sumY=sum(RNA)+1
naiveTheta=sumY/sumX
rec.append(naiveTheta)
DNA=line[a:b]
RNA=line[b:c]
avgX=sum(DNA)/DNAreps[i]
avgY=sum(RNA)/RNAreps[i] # normalized data
naiveTheta=avgY/avgX
rec.append(tf.math.log(naiveTheta)) # log-scale
a=c
return rec

Expand All @@ -310,7 +336,7 @@ def prepare_input(set,subdir,shouldRevComp,maxCases,config):
NUM_RNA=RNAreps
matrix=pd.DataFrame(Y)
matrix=tf.cast(matrix,tf.float32)
return (input_fasta_data_A.sequence, seq_matrix_A, X_reshaped, matrix)
return (input_fasta_data_A.sequence, seq_matrix_A, X_reshaped, matrix, input_fasta_data_A.idx)


def BuildModel(seqlen):
Expand Down Expand Up @@ -342,13 +368,15 @@ def BuildModel(seqlen):

# Optional attention layers
if(config.NumAttn>0):
x=x+keras_nlp.layers.SinePositionEncoding()(x)
x=x+keras_nlp.layers.RotaryEmbedding(name="rotary_embedding")(x)
for i in range(config.NumAttn):
skip=x
x=LayerNormalization()(x)
x=MultiHeadAttention(num_heads=config.AttnHeads[i],
key_dim=config.AttnKeyDim[i])(x,x)
x=Dropout(config.DropoutRate)(x)
x=LayerNormalization(name=f"layer_norm_{i}")(x)
x = TransformerEncoder(intermediate_dim=config.AttnKeyDim[i],
num_heads=config.AttnHeads[i],
dropout=config.DropoutRate,
name=f"transformer_encoder_{i}")(x)
x=Dropout(config.DropoutRate, name=f"dropout_{i}")(x)
if(config.AttnResidualSkip!=0):
x=Add()([x,skip])

Expand All @@ -363,7 +391,7 @@ def BuildModel(seqlen):
x = Flatten()(x) # Commented out on 3/22/2023

# dense layers
if(config.NumDense>0):
if((config.NumDense>0) and config.PreDenseDropout):
x=Dropout(config.DropoutRate)(x)
for i in range(config.NumDense):
x=kl.Dense(config.DenseSizes[i])(x)
Expand Down Expand Up @@ -403,9 +431,12 @@ def train(model,X_train,Y_train,X_valid,Y_valid):
#=========================================================================
# Command Line Interface
#=========================================================================
if(len(sys.argv)!=4):
exit(ProgramName.get()+" <parms.config> <data-subdir> <out:model-filestem>\n")
(configFile,subdir,modelFilestem)=sys.argv[1:]
main(configFile,subdir,modelFilestem)


import argparse
parser = argparse.ArgumentParser(prog=ProgramName.get(), description="Train a BlueSTARR model.")
parser.add_argument("configFile", help="Configuration file with hyperparameters.")
parser.add_argument("subdir", help="Subdirectory containing training, validation, and test data.")
parser.add_argument("modelFilestem", help="File (and path) stem for saving the model.")
parser.add_argument("--pretrained", help="Load pretrained model weights from this file.")
args = parser.parse_args()

main(args.configFile,args.subdir,args.modelFilestem, pretrained=args.pretrained)
15 changes: 15 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
ARG TENSORFLOW_VERSION=2.15.0

FROM tensorflow/tensorflow:${TENSORFLOW_VERSION}-gpu
ARG TENSORFLOW_VERSION

RUN apt-get update && apt-get install --no-install-recommends -y git

WORKDIR /bluestarr

COPY non-tensorflow-reqs.txt .

# We have to specify the tensorflow version or else keras_nlp will force an upgrade to the latest
# tensorflow version
RUN pip install --no-cache-dir -r non-tensorflow-reqs.txt tensorflow==${TENSORFLOW_VERSION}
COPY *.py .
1 change: 1 addition & 0 deletions K562.config
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ GlobalMaxPool = 0
GlobalAvePool = 1
NumDense = 0
DenseSizes = 0
PreDenseDropout = 1
NumAttentionLayers = 0
AttentionHeads = 0
AttentionKeyDim = 0
Expand Down
20 changes: 14 additions & 6 deletions NeuralConfig.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,9 @@ def __init__(self,filename):
self.NumDense=int(config.lookupOrDie("NumDense"))
self.DenseSizes=[int(x) for x in config.lookupListOrDie("DenseSizes")]
self.checkSize("DenseSizes",self.DenseSizes,self.NumDense)
self.PreDenseDropout=config.lookup("PreDenseDropout")
if(self.PreDenseDropout is None): self.PreDenseDropout=1 # defaults to true!
else: self.PreDenseDropout=int(self.PreDenseDropout)

# Parameters pertaining to output and loss
self.Tasks=config.lookupList("Tasks")
Expand Down Expand Up @@ -120,12 +123,17 @@ def dump(self):
print("DropoutRate=",self.DropoutRate,sep="")
print("LearningRate=",self.LearningRate,sep="")
print("ConvDropout=",self.ConvDropout,sep="")

print("PreDenseDropout=",self.PreDenseDropout,sep="")

#=========================================================================
# main()
#=========================================================================
#if(len(sys.argv)!=2):
# exit(ProgramName.get()+" <in.config>\n")
#(infile,)=sys.argv[1:]
#config=NeuralConfig(infile)
#config.dump()
def main():
if(len(sys.argv)!=2):
exit(ProgramName.get()+" <in.config>\n")
(infile,)=sys.argv[1:]
config=NeuralConfig(infile)
config.dump()

if __name__ == "__main__":
main()
Loading