Overall of OphnuRNASeq (from Nigg et al 2016) pipeline version 1: Jan 2019
Teeratas Kijpornyongpan
Start from loading SRA raw reads --> then do data cleaning (Trimmomatics)
Do tophat to map reads to the reference genomes --> Then ht-seq count to get known transcripts from .gff annotation

#Jan 7, 2019 --> Found from ht-seq count results that many reads are mapped to no feature
#            --> think about mapping reads to de novo transcriptome assembly in the future, but not for now.
#########################################################################
#####Script to download raw .fastq files from Nigg2016 Ophiostoma paper #####
#PBS -q mycospace
#PBS -l nodes=1:ppn=1
#PBS -l walltime=150:00:00

module load perl

cd $PBS_O_WORKDIR

fastq-dump -Z SRR3929074 > SRR3929074.fastq

efetch -db sra -id 'SRR3929073' >> PRJNA325932_metadata.xml
fastq-dump -Z SRR3929073 > SRR3929073.fastq

efetch -db sra -id 'SRR3929072' >> PRJNA325932_metadata.xml
fastq-dump -Z SRR3929072 > SRR3929072.fastq

efetch -db sra -id 'SRR3929071' >> PRJNA325932_metadata.xml
fastq-dump -Z SRR3929071 > SRR3929071.fastq

efetch -db sra -id 'SRR3929070' >> PRJNA325932_metadata.xml
fastq-dump -Z SRR3929070 > SRR3929070.fastq

efetch -db sra -id 'SRR3929069' >> PRJNA325932_metadata.xml
fastq-dump -Z SRR3929069 > SRR3929069.fastq

efetch -db sra -id 'SRR3929068' >> PRJNA325932_metadata.xml
fastq-dump -Z SRR3929068 > SRR3929068.fastq

efetch -db sra -id 'SRR3929067' >> PRJNA325932_metadata.xml
fastq-dump -Z SRR3929067 > SRR3929067.fastq

efetch -db sra -id 'SRR3929066' >> PRJNA325932_metadata.xml
fastq-dump -Z SRR3929066 > SRR3929066.fastq

efetch -db sra -id 'SRR3929065' >> PRJNA325932_metadata.xml
fastq-dump -Z SRR3929065 > SRR3929065.fastq

efetch -db sra -id 'SRR3929064' >> PRJNA325932_metadata.xml
fastq-dump -Z SRR3929064 > SRR3929064.fastq

efetch -db sra -id 'SRR3929063' >> PRJNA325932_metadata.xml
fastq-dump -Z SRR3929063 > SRR3929063.fastq

efetch -db sra -id 'SRR3929062' >> PRJNA325932_metadata.xml
fastq-dump -Z SRR3929062 > SRR3929062.fastq

efetch -db sra -id 'SRR3929061' >> PRJNA325932_metadata.xml
fastq-dump -Z SRR3929061 > SRR3929061.fastq

efetch -db sra -id 'SRR3929060' >> PRJNA325932_metadata.xml
fastq-dump -Z SRR3929060 > SRR3929060.fastq

efetch -db sra -id 'SRR3929059' >> PRJNA325932_metadata.xml
fastq-dump -Z SRR3929059 > SRR3929059.fastq

efetch -db sra -id 'SRR3929058' >> PRJNA325932_metadata.xml
fastq-dump -Z SRR3929058 > SRR3929058.fastq

cat Ophiostoma_download.sub

Jobfile fastqc --> check quality of Illumina reads using fastqc. Also, compare quality before/after Trimmomatics
#Job takes less than an hour!
Job started
10 January 2019 17:19:46
Analysis complete for SRR3929058.fastq
Analysis complete for SRR3929061.fastq
Analysis complete for SRR3929062.fastq
Analysis complete for SRR3929067.fastq
Analysis complete for SRR3929071.fastq
Analysis complete for SRR3929072.fastq
Job ended
10 January 2019 17:27:00

#PBS -q mycospace
#PBS -l nodes=1:ppn=1
#PBS -l walltime=40:00:00
#PBS -N Ophnu_fastqc

module use /apps/group/bioinformatics/modules
module load fastqc
cd $PBS_O_WORKDIR

echo "Job started"
date +"%d %B %Y %H:%M:%S"

mkdir SRR3929058_fastqc
fastqc SRR3929058.fastq  --o SRR3929058_fastqc
mkdir SRR3929061_fastqc
fastqc SRR3929061.fastq  --o SRR3929061_fastqc
mkdir SRR3929062_fastqc
fastqc SRR3929062.fastq  --o SRR3929062_fastqc
mkdir SRR3929067_fastqc
fastqc SRR3929067.fastq  --o SRR3929067_fastqc
mkdir SRR3929071_fastqc
fastqc SRR3929071.fastq  --o SRR3929071_fastqc
mkdir SRR3929072_fastqc
fastqc SRR3929072.fastq  --o SRR3929072_fastqc

echo "Job ended"
date +"%d %B %Y %H:%M:%S"
cat Ophnu_fastqc.sub

#########################################################################

Jobfile Trimmomatics
#Do Trimmomatics for Illumina raw reads from Nigg et al. 2016. Base qualities are slightly improved.
#LEADING:20 TRAILING:20 MINLEN:30 ILLUMINACLIP:TruSeq3-PE-2.fa:2:40:10 are preset parameters according to Genomics Core.
#It takes less than 10 mins to finish Trimmomatic for each sample.

#PBS -q mycospace
#PBS -l walltime=200:00:00
#PBS -l nodes=1:ppn=4

module load java
module load trimmomatic/0.36
module load fastqc

echo "Job started"
date +"%d %B %Y %H:%M:%S"

cd $PBS_O_WORKDIR

java -classpath /group/bioinfo/apps/apps/trimmomatic-0.36/trimmomatic-0.36.jar org.usadellab.trimmomatic.TrimmomaticSE  \
-threads 4 -phred33 SRR3929058.fastq ./Ophnu_YtoH_0h_rep1_trimmed.fastq LEADING:20 TRAILING:20 MINLEN:30 ILLUMINACLIP:TruSeq3-PE-2.fa:2:40:10
fastqc Ophnu_YtoH_0h_rep1_trimmed.fastq  --o Ophnu_YtoH_0h_rep1_trimmed_fastqc


java -classpath /group/bioinfo/apps/apps/trimmomatic-0.36/trimmomatic-0.36.jar org.usadellab.trimmomatic.TrimmomaticSE  \
-threads 4 -phred33 SRR3929072.fastq ./Ophnu_YtoH_0h_rep2_trimmed.fastq LEADING:20 TRAILING:20 MINLEN:30 ILLUMINACLIP:TruSeq3-PE-2.fa:2:40:10
fastqc Ophnu_YtoH_0h_rep2_trimmed.fastq  --o Ophnu_YtoH_0h_rep2_trimmed_fastqc


java -classpath /group/bioinfo/apps/apps/trimmomatic-0.36/trimmomatic-0.36.jar org.usadellab.trimmomatic.TrimmomaticSE  \
-threads 4 -phred33 SRR3929062.fastq ./Ophnu_YtoH_0h_rep3_trimmed.fastq LEADING:20 TRAILING:20 MINLEN:30 ILLUMINACLIP:TruSeq3-PE-2.fa:2:40:10
fastqc Ophnu_YtoH_0h_rep3_trimmed.fastq  --o Ophnu_YtoH_0h_rep3_trimmed_fastqc


java -classpath /group/bioinfo/apps/apps/trimmomatic-0.36/trimmomatic-0.36.jar org.usadellab.trimmomatic.TrimmomaticSE  \
-threads 4 -phred33 SRR3929071.fastq ./Ophnu_YtoH_27h_rep1_trimmed.fastq LEADING:20 TRAILING:20 MINLEN:30 ILLUMINACLIP:TruSeq3-PE-2.fa:2:40:10
fastqc Ophnu_YtoH_27h_rep1_trimmed.fastq  --o Ophnu_YtoH_27h_rep1_trimmed_fastqc


java -classpath /group/bioinfo/apps/apps/trimmomatic-0.36/trimmomatic-0.36.jar org.usadellab.trimmomatic.TrimmomaticSE  \
-threads 4 -phred33 SRR3929061.fastq ./Ophnu_YtoH_27h_rep2_trimmed.fastq LEADING:20 TRAILING:20 MINLEN:30 ILLUMINACLIP:TruSeq3-PE-2.fa:2:40:10
fastqc Ophnu_YtoH_27h_rep2_trimmed.fastq  --o Ophnu_YtoH_27h_rep2_trimmed_fastqc


java -classpath /group/bioinfo/apps/apps/trimmomatic-0.36/trimmomatic-0.36.jar org.usadellab.trimmomatic.TrimmomaticSE  \
-threads 4 -phred33 SRR3929067.fastq ./Ophnu_YtoH_27h_rep3_trimmed.fastq LEADING:20 TRAILING:20 MINLEN:30 ILLUMINACLIP:TruSeq3-PE-2.fa:2:40:10
fastqc Ophnu_YtoH_27h_rep3_trimmed.fastq  --o Ophnu_YtoH_27h_rep3_trimmed_fastqc

echo "Job done"
date +"%d %B %Y %H:%M:%S"
cat Trimmomatic_20181126.sub

#########################################################################
#####Notes from Purdue Genomics Core#####
Unaligned_filtered We run either Trimmomatic and/or cutadapt in order 
 to remove adapters. Poor quality bases (default of less than Phred-20)
 are removed (clipped) from both the 5' and 3' ends of the Unaligned 
 reads. After adapter removal and quality clipping any reads below a 
 minimum length (default 30 bases) are discarded. The Unaligned_filtered
 reads are the ones most often used for further processing. 
 The filtered reads for a sample are put all together into one large 
 file per read direction. You can access these either via the given link
 the 'Unaligned_filtered' directory(s) below or via the per-sample 
 directory. For paired-end reads it is possible that one of the reads 
 in a pair is still viable while the other is not. Viable now-single-end
 reads are put in the 'Unaligned_filtered_unpaired' directory. 
For paired-end (99% of the reads we do), the code used is: trimmomatic PE
 LEADING:20 TRAILING:20 MINLEN:30 ILLUMINACLIP:our_vector_file:2:40:10
#########################################################################

####Start from mapping reads that have adapters and low base quality trimmed, use tophat to map reads #####
Jobfile TopHat   ##It takes about 30 mins to finish tophat for each sample.

#PBS -q mycospace
#PBS -l walltime=70:00:00
#PBS -l nodes=1:ppn=4

echo "Job started"
date +"%d %B %Y %H:%M:%S"

cd $PBS_O_WORKDIR
module load bowtie2/2.2.9
module load perl
###Build genome references ###
bowtie2-build Ophnu1_Repeatmasked.fasta Ophnu1_Genome_Ref

###Then, do tophat ###

module load tophat/2.1.1
module load samtools/1.9

tophat -p 4 -o Ophnu_0h_rep1_tophat ./Ophnu1_Genome_Ref Ophnu_YtoH_0h_rep1_trimmed.fastq
cd Ophnu_0h_rep1_tophat
mv accepted_hits.bam Ophnu_0h_rep1.bam
mv Ophnu_0h_rep1.bam ..
cd ..


tophat -p 4 -o Ophnu_0h_rep2_tophat ./Ophnu1_Genome_Ref Ophnu_YtoH_0h_rep2_trimmed.fastq
cd Ophnu_0h_rep2_tophat
mv accepted_hits.bam Ophnu_0h_rep2.bam
mv Ophnu_0h_rep2.bam ..
cd ..


tophat -p 4 -o Ophnu_0h_rep3_tophat ./Ophnu1_Genome_Ref Ophnu_YtoH_0h_rep3_trimmed.fastq
cd Ophnu_0h_rep3_tophat
mv accepted_hits.bam Ophnu_0h_rep3.bam
mv Ophnu_0h_rep3.bam ..
cd ..


tophat -p 4 -o Ophnu_27h_rep1_tophat ./Ophnu1_Genome_Ref Ophnu_YtoH_27h_rep1_trimmed.fastq
cd Ophnu_27h_rep1_tophat
mv accepted_hits.bam Ophnu_27h_rep1.bam
mv Ophnu_27h_rep1.bam ..
cd ..


tophat -p 4 -o Ophnu_27h_rep2_tophat ./Ophnu1_Genome_Ref Ophnu_YtoH_27h_rep2_trimmed.fastq
cd Ophnu_27h_rep2_tophat
mv accepted_hits.bam Ophnu_27h_rep2.bam
mv Ophnu_27h_rep2.bam ..
cd ..


tophat -p 4 -o Ophnu_27h_rep3_tophat ./Ophnu1_Genome_Ref Ophnu_YtoH_27h_rep3_trimmed.fastq
cd Ophnu_27h_rep3_tophat
mv accepted_hits.bam Ophnu_27h_rep3.bam
mv Ophnu_27h_rep3.bam ..
cd ..

echo "Job done"
date +"%d %B %Y %H:%M:%S"

cat tophat_20190111.sub
## Do Tophat for the cleaned Illumina reads of Ophnu: 6 samples --> 0h and 27h, 3 reps each ##

#########################################################################
#####Summary on TopHat #####

Ophnu_0h_rep1 TopHat
Reads:
          Input     :  11416840
           Mapped   :   6745116 (59.1% of input)
            of these:     14813 ( 0.2%) have multiple alignments (0 have >20)
59.1% overall read mapping rate.


Ophnu_0h_rep2 TopHat
Reads:
          Input     :   7036615
           Mapped   :   3645220 (51.8% of input)
            of these:      6525 ( 0.2%) have multiple alignments (1 have >20)
51.8% overall read mapping rate.


Ophnu_0h_rep3 TopHat
Reads:
          Input     :  11219931
           Mapped   :   7009633 (62.5% of input)
            of these:     14952 ( 0.2%) have multiple alignments (0 have >20)
62.5% overall read mapping rate.


Ophnu_27h_rep1 TopHat
Reads:
          Input     :   8598352
           Mapped   :   5145382 (59.8% of input)
            of these:      8878 ( 0.2%) have multiple alignments (0 have >20)
59.8% overall read mapping rate.


Ophnu_27h_rep2 TopHat
Reads:
          Input     :   6855724
           Mapped   :   3550559 (51.8% of input)
            of these:      6307 ( 0.2%) have multiple alignments (0 have >20)
51.8% overall read mapping rate.


Ophnu_27h_rep3 TopHat
Reads:
          Input     :   7438742
           Mapped   :   4533933 (61.0% of input)
            of these:      7253 ( 0.2%) have multiple alignments (0 have >20)
61.0% overall read mapping rate.
#########################################################################

#####Then, transform .bam file into read counts using htseq #####
Jobfile htseq for Ophnu

#PBS -q mycospace
#PBS -l walltime=30:00:00
#PBS -l nodes=1:ppn=1
#PBS -N htseq_Ophnu
echo "Job started"
date +"%d %B %Y %H:%M:%S"

cd $PBS_O_WORKDIR
module load samtools/1.9
module load biopython/2.7.12
module load htseq/0.7.0

####Read counts using htseq --> take around one hour-ish for each .bam file#####
##Will start by sorting alignment using samtools, then go for htseq count ##
## -t CDS means collects the CDS data, -i proteinId is a unique identifier in .gff file, -f bam specifies an input file as a .bam file
## then go for input alignment file and .gff file, ending with > [output file]

samtools sort -o Ophnu_0h_rep1.sorted.bam Ophnu_0h_rep1.bam
htseq-count -t gene -i proteinId  --stranded=no -f bam Ophnu_0h_rep1.sorted.bam Ophnu1_Gene_model.gff3 >Ophnu_0h_rep1.count

samtools sort -o Ophnu_0h_rep2.sorted.bam Ophnu_0h_rep2.bam
htseq-count -t gene -i proteinId  --stranded=no -f bam Ophnu_0h_rep2.sorted.bam Ophnu1_Gene_model.gff3 >Ophnu_0h_rep2.count

samtools sort -o Ophnu_0h_rep3.sorted.bam Ophnu_0h_rep3.bam
htseq-count -t gene -i proteinId  --stranded=no -f bam Ophnu_0h_rep3.sorted.bam Ophnu1_Gene_model.gff3 >Ophnu_0h_rep3.count

samtools sort -o Ophnu_27h_rep1.sorted.bam Ophnu_27h_rep1.bam
htseq-count -t gene -i proteinId  --stranded=no -f bam Ophnu_27h_rep1.sorted.bam Ophnu1_Gene_model.gff3 >Ophnu_27h_rep1.count

samtools sort -o Ophnu_27h_rep2.sorted.bam Ophnu_27h_rep2.bam
htseq-count -t gene -i proteinId  --stranded=no -f bam Ophnu_27h_rep2.sorted.bam Ophnu1_Gene_model.gff3 >Ophnu_27h_rep2.count

samtools sort -o Ophnu_27h_rep3.sorted.bam Ophnu_27h_rep3.bam
htseq-count -t gene -i proteinId  --stranded=no -f bam Ophnu_27h_rep3.sorted.bam Ophnu1_Gene_model.gff3 >Ophnu_27h_rep3.count

echo "Job done"
date +"%d %B %Y %H:%M:%S"
cat htseq_20190111.sub

#########################################################################
Results from htseq-count
Ophnu
Proportion of ambiguous alignment: 0.007 - 0.011%
Proportion of reads not have a unique aligment: 0.34 - 0.49% 
Proportion of reads mapped to no features: 6.8 - 13.8%
#########################################################################