Skip to content

Commit f00f153

Browse files
committed
uploaded transcriptome assembly scripts
0 parents  commit f00f153

11 files changed

Lines changed: 248 additions & 0 deletions
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
#!/bin/bash
2+
#SBATCH -p hns,spalumbi
3+
#SBATCH --mail-type=FAIL
4+
#################
5+
#set a job name
6+
#SBATCH --job-name=Astroides_GM
7+
#################
8+
#a file for job output, you can check job progress
9+
#SBATCH --output=Astroides_GM
10+
#################
11+
# a file for errors from the job
12+
#SBATCH --error=Astroides_GM.err
13+
#################
14+
#time limit, default is 2 hours
15+
#SBATCH --time 48:0:0
16+
#number of CPUs
17+
#SBATCH -c 16
18+
#amount of RAM, most hns have 124 or 191
19+
#SBATCH --mem=191G
20+
21+
ml biology trinity/2.8.4/
22+
23+
Trinity --seqType fq --max_memory 191G --CPU 16 --min_contig_length 300 --trimmomatic --monitoring --left /scratch/groups/spalumbi/beth/Astroides/raw/fastq/14283X1_1.fq,/scratch/groups/spalumbi/beth/Astroides/raw/fastq/14283X2_1.fq,/scratch/groups/spalumbi/beth/Astroides/raw/fastq/14283X3_1.fq,/scratch/groups/spalumbi/beth/Astroides/raw/fastq/14283X4_1.fq --right /scratch/groups/spalumbi/beth/Astroides/raw/fastq/14283X1_2.fq,/scratch/groups/spalumbi/beth/Astroides/raw/fastq/14283X2_2.fq,/scratch/groups/spalumbi/beth/Astroides/raw/fastq/14283X3_2.fq,/scratch/groups/spalumbi/beth/Astroides/raw/fastq/14283X4_2.fq
24+
25+
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
#!/bin/bash
2+
#SBATCH -p hns,spalumbi
3+
#################
4+
#set a job name
5+
#SBATCH --job-name=Astroides_PV
6+
#################
7+
#a file for job output, you can check job progress
8+
#SBATCH --output=Astroides_PV
9+
#################
10+
# a file for errors from the job
11+
#SBATCH --error=Astroides_PV.err
12+
#################
13+
#time limit, default is 2 hours
14+
#SBATCH --time 48:0:0
15+
#number of CPUs
16+
#SBATCH -c 16
17+
#amount of RAM, most hns have 124 or 191
18+
#SBATCH --mem=191G
19+
20+
21+
ml biology trinity/2.8.4/
22+
23+
Trinity --seqType fq --max_memory 191G --CPU 16 --min_contig_length 300 --trimmomatic --monitoring --left /scratch/groups/spalumbi/beth/Astroides/raw/fastq/14283X8_1.fq,/scratch/groups/spalumbi/beth/Astroides/raw/fastq/14283X9_1.fq,/scratch/groups/spalumbi/beth/Astroides/raw/fastq/14283X10_1.fq --right /scratch/groups/spalumbi/beth/Astroides/raw/fastq/14283X8_2.fq,/scratch/groups/spalumbi/beth/Astroides/raw/fastq/14283X9_2.fq,/scratch/groups/spalumbi/beth/Astroides/raw/fastq/14283X10_2.fq
24+
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
#!/bin/bash
2+
#SBATCH -p hns,spalumbi
3+
#################
4+
#set a job name
5+
#SBATCH --job-name=Astroides_SA
6+
#################
7+
#a file for job output, you can check job progress
8+
#SBATCH --output=Astroides_SA
9+
#################
10+
# a file for errors from the job
11+
#SBATCH --error=Astroides_SA.err
12+
#################
13+
#time limit, default is 2 hours
14+
#SBATCH --time 48:0:0
15+
#number of CPUs
16+
#SBATCH -c 16
17+
#amount of RAM, most hns have 124 or 191
18+
#SBATCH --mem=191G
19+
20+
21+
ml biology trinity/2.8.4/
22+
23+
Trinity --seqType fq --max_memory 191G --CPU 16 --min_contig_length 300 --trimmomatic --monitoring --left /scratch/groups/spalumbi/beth/Astroides/raw/fastq/14283X5_1.fq,/scratch/groups/spalumbi/beth/Astroides/raw/fastq/14283X6_1.fq,/scratch/groups/spalumbi/beth/Astroides/raw/fastq/14283X7_1.fq --right /scratch/groups/spalumbi/beth/Astroides/raw/fastq/14283X5_2.fq,/scratch/groups/spalumbi/beth/Astroides/raw/fastq/14283X6_2.fq,/scratch/groups/spalumbi/beth/Astroides/raw/fastq/14283X7_2.fq
24+
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
#!/bin/bash
2+
#SBATCH -p spalumbi
3+
#################
4+
#set a job name
5+
#SBATCH --job-name=Astroides_SP
6+
#################
7+
#a file for job output, you can check job progress
8+
#SBATCH --output=Astroides_SP
9+
#################
10+
# a file for errors from the job
11+
#SBATCH --error=Astroides_SP.err
12+
#################
13+
#time limit, default is 2 hours
14+
#SBATCH --time 48:0:0
15+
#number of CPUs
16+
#SBATCH -c 16
17+
#amount of RAM, most hns have 124 or 191
18+
#SBATCH --mem=191G
19+
20+
21+
ml biology trinity/2.8.4/
22+
23+
Trinity --seqType fq --max_memory 191G --CPU 16 --min_contig_length 300 --trimmomatic --monitoring --left /scratch/groups/spalumbi/beth/Astroides/raw/14833R/14833X10_1.fq,/scratch/groups/spalumbi/beth/Astroides/raw/14833R/14833X29_1.fq,/scratch/groups/spalumbi/beth/Astroides/raw/14833R/14833X38_1.fq --right /scratch/groups/spalumbi/beth/Astroides/raw/14833R/14833X10_2.fq,/scratch/groups/spalumbi/beth/Astroides/raw/14833R/14833X29_2.fq,/scratch/groups/spalumbi/beth/Astroides/raw/14833R/14833X38_2.fq
24+

scripts/batch-blast-uniprot.sh

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
#!/bin/bash
2+
#SBATCH -p owners
3+
#This script will split an assembly into batches of 100 contigs and submit blastx-uniprot.sh for each temp file to the cluster
4+
5+
#usage: bash batch-blast-uniprot.sh assembly.fa
6+
7+
# Split the assembly given as $1 into TEMP_<n>.fa files of 100 contigs each,
# where <n> is the 0-based index of the first contig in that batch.
# Each finished batch file is closed before the next one is opened so that
# large assemblies cannot run into awk's limit on simultaneously open files.
awk 'BEGIN {n_seq=0;} /^>/{if(n_seq%100==0){if(file!=""){close(file);} file=sprintf("TEMP_%d.fa",n_seq);} print >> file;n_seq++; next;} { print >> file; }' < "$1"
8+
9+
10+
# Submit one blastx-uniprot.sh job per batch file in the current directory.
for i in TEMP*.fa; do
  # If the glob matched nothing, $i is the literal pattern: do not submit it.
  [ -e "$i" ] || continue
  sbatch blastx-uniprot.sh "$i"
done
11+
12+
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
#!/bin/bash
2+
#USAGE: bash batch-hisat2-fq-paired.sh index 4 *_1.txt.gz
3+
#if you don't have a hisat2 index, build it with "hisat2-build <reference>.fa basename"
4+
module load java
5+
CHUNK=$2
6+
COUNTER=0
7+
FQ="${@:3}"
8+
for i in $FQ; do
9+
if [ $COUNTER -eq 0 ]; then
10+
echo -e "#!/bin/bash\n#SBATCH -p owners\n#SBATCH --ntasks=1\n#SBATCH --cpus-per-task=3\n#SBATCH -t 12:00:00\n#SBATCH --mem 24000" > TEMPBATCH.sbatch; fi
11+
# Derive the sample name from the forward-read file name: drop the directory,
# the .gz/.txt/.fq/.fastq extensions and the trailing _1 mate tag, so that
# sample_1.txt.gz yields "sample". Without removing _1, the ${BASE}_1.txt.gz
# and ${BASE}_2.txt.gz names used in the hisat2 command would not exist.
BASE=$(basename "$(basename "$(basename "$(basename "$i" .gz)" .txt)" .fq)" .fastq)
BASE=${BASE%_1}
12+
echo "srun hisat2 --end-to-end --very-sensitive --no-spliced-alignment -p 3 -X 1500 --rg-id $BASE --rg SM:$BASE -x $1 -1 ${BASE}_1.txt.gz -2 ${BASE}_2.txt.gz > $BASE.sam" >> TEMPBATCH.sbatch
13+
echo "samtools view -bSq 10 ${BASE}.sam > ${BASE}_BTVS-UNSORTED.bam " >> TEMPBATCH.sbatch
14+
echo "srun samtools sort ${BASE}_BTVS-UNSORTED.bam > ${BASE}_UNDEDUP.bam" >> TEMPBATCH.sbatch
15+
echo "srun java -Xmx4g -jar /share/PI/spalumbi/programs/picard.jar MarkDuplicates REMOVE_DUPLICATES=true INPUT=${BASE}_UNDEDUP.bam OUTPUT=${BASE}.bam METRICS_FILE=${BASE}-metrics.txt VALIDATION_STRINGENCY=LENIENT" >> TEMPBATCH.sbatch
16+
echo "srun samtools index ${BASE}.bam" >> TEMPBATCH.sbatch
17+
echo "rm ${BASE}.sam" >> TEMPBATCH.sbatch
18+
echo "rm ${BASE}_BTVS-UNSORTED.bam" >> TEMPBATCH.sbatch
19+
echo "rm ${BASE}_UNDEDUP.bam" >> TEMPBATCH.sbatch
20+
let COUNTER=COUNTER+1
21+
if [ $COUNTER -eq $CHUNK ]; then
22+
sbatch TEMPBATCH.sbatch
23+
COUNTER=0; fi
24+
done
25+
if [ $COUNTER -ne 0 ]; then
26+
sbatch TEMPBATCH.sbatch; fi

scripts/bcftools_parallel_eas.sh

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
#!/bin/bash
2+
#before running: mkdir vcfout
3+
#usage: bash bcftools_parallel_eas.sh ref.fa vcfout 24 args *.bam
4+
#args are other arguments, for instance "-t DP"
5+
6+
REF=$1
7+
NCPU=$3
8+
VCFOUT=$2
9+
BAMS="${@:4}"
10+
echo $REF
11+
echo $VCFOUT
12+
echo $BAMS
13+
14+
samtools faidx $1
15+
# Write one BED interval per reference sequence from the faidx index
# (column 1 = name, column 2 = length). BED intervals are 0-based and
# half-open, so a whole sequence is "name 0 length"; using length-1 as the
# end would leave the last base of every sequence out of the pileup.
awk '{print $1,"0",$2}' "${1}.fai" > "$VCFOUT/REGIONS.bed"
16+
nregions=($(wc -l $VCFOUT/REGIONS.bed))
17+
# Regions per batch, rounded up: this spreads the regions over at most NCPU
# batches and keeps the batch size >= 1 whenever there is at least one region
# (plain floor division gives 0 when there are fewer regions than NCPU, and
# split rejects "-l 0").
nlines=$(( (nregions + NCPU - 1) / NCPU ))
18+
echo "Splitting into batches of "$nlines
19+
split $VCFOUT/REGIONS.bed -l $nlines $VCFOUT/TEMP-REGIONS
20+
21+
for i in $VCFOUT/TEMP-REGIONS* ; do
22+
echo sending out batch $i
23+
# echo -e "#!/bin/bash\n#SBATCH --time=24:00:00 -p spalumbi,owners" > ${i}.sbatch
24+
echo -e "#!/bin/bash\n#SBATCH --time=24:00:00 -p owners" > ${i}.sbatch
25+
echo "samtools mpileup -l $i -t AD -ugf $REF $BAMS | bcftools call -vmO v > ${i}.vcf" >> ${i}.sbatch
26+
sbatch ${i}.sbatch
27+
done

scripts/blastx-uniprot.sh

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
#!/bin/bash
2+
#SBATCH -p hns,owners
3+
#SBATCH --time=48:00:00
4+
#SBATCH --mem=32000
5+
#SBATCH --ntasks=1
6+
#SBATCH --cpus-per-task=6
7+
########################
8+
# -outfmt 5
9+
# to call: sbatch blastx-uniprot.sh input.fa
10+
11+
#this echoes your TEMP file name to the slurm output in case any files are aborted on owners nodes or have errors
12+
echo $1
13+
14+
#blast against uniprot_db
15+
#remember to update the uniprot database on sherlock regularly
16+
17+
ml biology ncbi-blast+/2.7.1
18+
# blastx of the query FASTA ($1) against the UniProt database; XML output
# (-outfmt 5) is written next to the input as <input>.blast.out.
# -num_threads matches the --cpus-per-task requested in the SBATCH header.
# $1 is quoted so paths containing spaces or glob characters are passed intact.
blastx -db /scratch/groups/spalumbi/BLAST_db/uniprot_trembl/uniprot_sprot_trembl_April2019 -query "$1" -out "$1.blast.out" -evalue 0.001 -max_hsps 1 -num_threads 6 -outfmt 5
19+

scripts/busco.sh

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
#!/bin/bash
2+
#SBATCH -p owners,spalumbi
3+
#SBATCH -c 4
4+
#SBATCH --time 24:0:0
5+
6+
#created by Beth March 2019
7+
#before using this script, you need to add the following line (the path to the BUSCO config file) to the .bashrc file in your home directory:
8+
# export BUSCO_CONFIG_FILE="/home/groups/spalumbi/programs/busco-master/config/config.ini"
9+
10+
#usage
11+
#busco.sh inputfile outputfile genome
12+
13+
#genome= genome, transcriptome, proteins
14+
# I didn't install the Augustus package needed for genome assessment yet. You will need to do that to use this script on a genome.
15+
16+
17+
ml biology py-busco
18+
# $1 = input file, $2 = output name, $3 = mode (genome, transcriptome or proteins).
# Arguments are quoted so values containing spaces or glob characters are passed intact.
run_BUSCO.py -i "$1" -o "$2" -m "$3" -l /home/groups/spalumbi/programs/busco-master/metazoa_odb9/
19+

scripts/cap3.sh

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
#!/bin/bash
2+
3+
#SBATCH --time=8:00:00
4+
#SBATCH -p spalumbi,hns,owners
5+
#SBATCH --mem=181GB
6+
7+
### OPTIONS FOR SHORT READS
8+
9+
# -i specify segment pair score cutoff N > 20 (40)
10+
#The -i option is used to specify a score cutoff on segment pairs (ungapped alignments).
11+
#The score of a segment pair with 19 base matches and 1 base mismatch is 2 * 19 + (-5) * 1 = 33,
12+
# where each base match is given a score of 2 and each mismatch is given a score of -5.
13+
14+
# -j specify chain score cutoff N > 30 (80)
15+
#The -j option is used to specify a score cutoff on chains of segment pairs,
16+
#where the score of a chain is the sum of scores of each segment pair
17+
#minus penalties for gaps between segment pairs.
18+
#The score of a chain consisting of one segment pair is simply the score of
19+
#the segment pair.
20+
21+
# -o specify overlap length cutoff > 15 (40)
22+
# -s specify overlap similarity score cutoff N > 250 (900)
23+
#The -o option is used to specify a length cutoff on overlaps,
24+
#whereas the -s option is used to specify a score cutoff (based on matches, mismatches, and gaps) on overlaps.
25+
26+
# -p specify overlap percent identity cutoff N > 65 (90)
27+
28+
#to call: sbatch cap3.sh <yourassembly>.fasta
29+
30+
cap3 $1
31+
32+
#recommended for short reads
33+
#-i 30 -j 31 -o 18 -s 300

0 commit comments

Comments
 (0)