Overall of OphnuRNASeq (from Nigg et al 2016) pipeline version 1: Jan 2019 Teeratas Kijpornyongpan Start from loading SRA raw reads --> then do data cleaning (Trimmomatics) Do tophat to map reads to the reference genomes --> Then ht-seq count to get known transcripts from .gff annotation #Jan 7, 2019 --> Found from ht-seq count results that many reads are mapped to no feature # --> think about mapping reads to de novo transcriptome assembly in the future, but not for now. ######################################################################### #####Script to download raw .fastq files from Nigg2016 Ophiostoma paper ##### #PBS -q mycospace #PBS -l nodes=1:ppn=1 #PBS -l walltime=150:00:00 module load perl cd $PBS_O_WORKDIR fastq-dump -Z SRR3929074 > SRR3929074.fastq efetch -db sra -id 'SRR3929073' >> PRJNA325932_metadata.xml fastq-dump -Z SRR3929073 > SRR3929073.fastq efetch -db sra -id 'SRR3929072' >> PRJNA325932_metadata.xml fastq-dump -Z SRR3929072 > SRR3929072.fastq efetch -db sra -id 'SRR3929071' >> PRJNA325932_metadata.xml fastq-dump -Z SRR3929071 > SRR3929071.fastq efetch -db sra -id 'SRR3929070' >> PRJNA325932_metadata.xml fastq-dump -Z SRR3929070 > SRR3929070.fastq efetch -db sra -id 'SRR3929069' >> PRJNA325932_metadata.xml fastq-dump -Z SRR3929069 > SRR3929069.fastq efetch -db sra -id 'SRR3929068' >> PRJNA325932_metadata.xml fastq-dump -Z SRR3929068 > SRR3929068.fastq efetch -db sra -id 'SRR3929067' >> PRJNA325932_metadata.xml fastq-dump -Z SRR3929067 > SRR3929067.fastq efetch -db sra -id 'SRR3929066' >> PRJNA325932_metadata.xml fastq-dump -Z SRR3929066 > SRR3929066.fastq efetch -db sra -id 'SRR3929065' >> PRJNA325932_metadata.xml fastq-dump -Z SRR3929065 > SRR3929065.fastq efetch -db sra -id 'SRR3929064' >> PRJNA325932_metadata.xml fastq-dump -Z SRR3929064 > SRR3929064.fastq efetch -db sra -id 'SRR3929063' >> PRJNA325932_metadata.xml fastq-dump -Z SRR3929063 > SRR3929063.fastq efetch -db sra -id 'SRR3929062' >> PRJNA325932_metadata.xml fastq-dump -Z SRR3929062 > SRR3929062.fastq efetch -db sra -id 'SRR3929061' >> PRJNA325932_metadata.xml fastq-dump -Z SRR3929061 > SRR3929061.fastq efetch -db sra -id 'SRR3929060' >> PRJNA325932_metadata.xml fastq-dump -Z SRR3929060 > SRR3929060.fastq efetch -db sra -id 'SRR3929059' >> PRJNA325932_metadata.xml fastq-dump -Z SRR3929059 > SRR3929059.fastq efetch -db sra -id 'SRR3929058' >> PRJNA325932_metadata.xml fastq-dump -Z SRR3929058 > SRR3929058.fastq cat Ophiostoma_download.sub Jobfile fastqc --> check quality of Illumina reads using fastqc. Also, compare quality before/after Trimmomatics #Job takes less than an hour! Job started 10 January 2019 17:19:46 Analysis complete for SRR3929058.fastq Analysis complete for SRR3929061.fastq Analysis complete for SRR3929062.fastq Analysis complete for SRR3929067.fastq Analysis complete for SRR3929071.fastq Analysis complete for SRR3929072.fastq Job ended 10 January 2019 17:27:00 #PBS -q mycospace #PBS -l nodes=1:ppn=1 #PBS -l walltime=40:00:00 #PBS -N Ophnu_fastqc module use /apps/group/bioinformatics/modules module load fastqc cd $PBS_O_WORKDIR echo "Job started" date +"%d %B %Y %H:%M:%S" mkdir SRR3929058_fastqc fastqc SRR3929058.fastq --o SRR3929058_fastqc mkdir SRR3929061_fastqc fastqc SRR3929061.fastq --o SRR3929061_fastqc mkdir SRR3929062_fastqc fastqc SRR3929062.fastq --o SRR3929062_fastqc mkdir SRR3929067_fastqc fastqc SRR3929067.fastq --o SRR3929067_fastqc mkdir SRR3929071_fastqc fastqc SRR3929071.fastq --o SRR3929071_fastqc mkdir SRR3929072_fastqc fastqc SRR3929072.fastq --o SRR3929072_fastqc echo "Job ended" date +"%d %B %Y %H:%M:%S" cat Ophnu_fastqc.sub ######################################################################### Jobfile Trimmomatics #Do Trimmomatics for Illumina raw reads from Nigg et al. 2016. Base qualities are slightly improved. #LEADING:20 TRAILING:20 MINLEN:30 ILLUMINACLIP:TruSeq3-PE-2.fa:2:40:10 are preset parameters according to Genomics Core. #It takes less than 10 mins to finish Trimmomatic for each sample. #PBS -q mycospace #PBS -l walltime=200:00:00 #PBS -l nodes=1:ppn=4 module load java module load trimmomatic/0.36 module load fastqc echo "Job started" date +"%d %B %Y %H:%M:%S" cd $PBS_O_WORKDIR java -classpath /group/bioinfo/apps/apps/trimmomatic-0.36/trimmomatic-0.36.jar org.usadellab.trimmomatic.TrimmomaticSE \ -threads 4 -phred33 SRR3929058.fastq ./Ophnu_YtoH_0h_rep1_trimmed.fastq LEADING:20 TRAILING:20 MINLEN:30 ILLUMINACLIP:TruSeq3-PE-2.fa:2:40:10 fastqc Ophnu_YtoH_0h_rep1_trimmed.fastq --o Ophnu_YtoH_0h_rep1_trimmed_fastqc java -classpath /group/bioinfo/apps/apps/trimmomatic-0.36/trimmomatic-0.36.jar org.usadellab.trimmomatic.TrimmomaticSE \ -threads 4 -phred33 SRR3929072.fastq ./Ophnu_YtoH_0h_rep2_trimmed.fastq LEADING:20 TRAILING:20 MINLEN:30 ILLUMINACLIP:TruSeq3-PE-2.fa:2:40:10 fastqc Ophnu_YtoH_0h_rep2_trimmed.fastq --o Ophnu_YtoH_0h_rep2_trimmed_fastqc java -classpath /group/bioinfo/apps/apps/trimmomatic-0.36/trimmomatic-0.36.jar org.usadellab.trimmomatic.TrimmomaticSE \ -threads 4 -phred33 SRR3929062.fastq ./Ophnu_YtoH_0h_rep3_trimmed.fastq LEADING:20 TRAILING:20 MINLEN:30 ILLUMINACLIP:TruSeq3-PE-2.fa:2:40:10 fastqc Ophnu_YtoH_0h_rep3_trimmed.fastq --o Ophnu_YtoH_0h_rep3_trimmed_fastqc java -classpath /group/bioinfo/apps/apps/trimmomatic-0.36/trimmomatic-0.36.jar org.usadellab.trimmomatic.TrimmomaticSE \ -threads 4 -phred33 SRR3929071.fastq ./Ophnu_YtoH_27h_rep1_trimmed.fastq LEADING:20 TRAILING:20 MINLEN:30 ILLUMINACLIP:TruSeq3-PE-2.fa:2:40:10 fastqc Ophnu_YtoH_27h_rep1_trimmed.fastq --o Ophnu_YtoH_27h_rep1_trimmed_fastqc java -classpath /group/bioinfo/apps/apps/trimmomatic-0.36/trimmomatic-0.36.jar org.usadellab.trimmomatic.TrimmomaticSE \ -threads 4 -phred33 SRR3929061.fastq ./Ophnu_YtoH_27h_rep2_trimmed.fastq LEADING:20 TRAILING:20 MINLEN:30 ILLUMINACLIP:TruSeq3-PE-2.fa:2:40:10 fastqc Ophnu_YtoH_27h_rep2_trimmed.fastq --o Ophnu_YtoH_27h_rep2_trimmed_fastqc java -classpath /group/bioinfo/apps/apps/trimmomatic-0.36/trimmomatic-0.36.jar org.usadellab.trimmomatic.TrimmomaticSE \ -threads 4 -phred33 SRR3929067.fastq ./Ophnu_YtoH_27h_rep3_trimmed.fastq LEADING:20 TRAILING:20 MINLEN:30 ILLUMINACLIP:TruSeq3-PE-2.fa:2:40:10 fastqc Ophnu_YtoH_27h_rep3_trimmed.fastq --o Ophnu_YtoH_27h_rep3_trimmed_fastqc echo "Job done" date +"%d %B %Y %H:%M:%S" cat Trimmomatic_20181126.sub ######################################################################### #####Notes from Purdue Genomics Core##### Unaligned_filtered We run either Trimmomatic and/or cutadapt in order to remove adapters. Poor quality bases (default of less than Phred-20) are removed (clipped) from both the 5' and 3' ends of the Unaligned reads. After adapter removal and quality clipping any reads below a minimum length (default 30 bases) are discarded. The Unaligned_filtered reads are the ones most often used for further processing. The filtered reads for a sample are put all together into one large file per read direction. You can access these either via the given link the 'Unaligned_filtered' directory(s) below or via the per-sample directory. For paired-end reads it is possible that one of the reads in a pair is still viable while the other is not. Viable now-single-end reads are put in the 'Unaligned_filtered_unpaired' directory. For paired-end (99% of the reads we do), the code used is: trimmomatic PE LEADING:20 TRAILING:20 MINLEN:30 ILLUMINACLIP:our_vector_file:2:40:10 ######################################################################### ####Start from mapping reads that have adapters and low base quality trimmed, use tophat to map reads ##### Jobfile TopHat ##It takes about 30 mins to finish tophat for each sample. #PBS -q mycospace #PBS -l walltime=70:00:00 #PBS -l nodes=1:ppn=4 echo "Job started" date +"%d %B %Y %H:%M:%S" cd $PBS_O_WORKDIR module load bowtie2/2.2.9 module load perl ###Build genome references ### bowtie2-build Ophnu1_Repeatmasked.fasta Ophnu1_Genome_Ref ###Then, do tophat ### module load tophat/2.1.1 module load samtools/1.9 tophat -p 4 -o Ophnu_0h_rep1_tophat ./Ophnu1_Genome_Ref Ophnu_YtoH_0h_rep1_trimmed.fastq cd Ophnu_0h_rep1_tophat mv accepted_hits.bam Ophnu_0h_rep1.bam mv Ophnu_0h_rep1.bam .. cd .. tophat -p 4 -o Ophnu_0h_rep2_tophat ./Ophnu1_Genome_Ref Ophnu_YtoH_0h_rep2_trimmed.fastq cd Ophnu_0h_rep2_tophat mv accepted_hits.bam Ophnu_0h_rep2.bam mv Ophnu_0h_rep2.bam .. cd .. tophat -p 4 -o Ophnu_0h_rep3_tophat ./Ophnu1_Genome_Ref Ophnu_YtoH_0h_rep3_trimmed.fastq cd Ophnu_0h_rep3_tophat mv accepted_hits.bam Ophnu_0h_rep3.bam mv Ophnu_0h_rep3.bam .. cd .. tophat -p 4 -o Ophnu_27h_rep1_tophat ./Ophnu1_Genome_Ref Ophnu_YtoH_27h_rep1_trimmed.fastq cd Ophnu_27h_rep1_tophat mv accepted_hits.bam Ophnu_27h_rep1.bam mv Ophnu_27h_rep1.bam .. cd .. tophat -p 4 -o Ophnu_27h_rep2_tophat ./Ophnu1_Genome_Ref Ophnu_YtoH_27h_rep2_trimmed.fastq cd Ophnu_27h_rep2_tophat mv accepted_hits.bam Ophnu_27h_rep2.bam mv Ophnu_27h_rep2.bam .. cd .. tophat -p 4 -o Ophnu_27h_rep3_tophat ./Ophnu1_Genome_Ref Ophnu_YtoH_27h_rep3_trimmed.fastq cd Ophnu_27h_rep3_tophat mv accepted_hits.bam Ophnu_27h_rep3.bam mv Ophnu_27h_rep3.bam .. cd .. echo "Job done" date +"%d %B %Y %H:%M:%S" cat tophat_20190111.sub ## Do Tophat for the cleaned Illumina reads of Ophnu: 6 samples --> 0h and 27h, 3 reps each ## ######################################################################### #####Summary on TopHat ##### Ophnu_0h_rep1 TopHat Reads: Input : 11416840 Mapped : 6745116 (59.1% of input) of these: 14813 ( 0.2%) have multiple alignments (0 have >20) 59.1% overall read mapping rate. Ophnu_0h_rep2 TopHat Reads: Input : 7036615 Mapped : 3645220 (51.8% of input) of these: 6525 ( 0.2%) have multiple alignments (1 have >20) 51.8% overall read mapping rate. Ophnu_0h_rep3 TopHat Reads: Input : 11219931 Mapped : 7009633 (62.5% of input) of these: 14952 ( 0.2%) have multiple alignments (0 have >20) 62.5% overall read mapping rate. Ophnu_27h_rep1 TopHat Reads: Input : 8598352 Mapped : 5145382 (59.8% of input) of these: 8878 ( 0.2%) have multiple alignments (0 have >20) 59.8% overall read mapping rate. Ophnu_27h_rep2 TopHat Reads: Input : 6855724 Mapped : 3550559 (51.8% of input) of these: 6307 ( 0.2%) have multiple alignments (0 have >20) 51.8% overall read mapping rate. Ophnu_27h_rep3 TopHat Reads: Input : 7438742 Mapped : 4533933 (61.0% of input) of these: 7253 ( 0.2%) have multiple alignments (0 have >20) 61.0% overall read mapping rate. ######################################################################### #####Then, transform .bam file into read counts using htseq ##### Jobfile htseq for Ophnu #PBS -q mycospace #PBS -l walltime=30:00:00 #PBS -l nodes=1:ppn=1 #PBS -N htseq_Ophnu echo "Job started" date +"%d %B %Y %H:%M:%S" cd $PBS_O_WORKDIR module load samtools/1.9 module load biopython/2.7.12 module load htseq/0.7.0 ####Read counts using htseq --> take around one hour-ish for each .bam file##### ##Will start by sorting alignment using samtools, then go for htseq count ## ## -t CDS means collects the CDS data, -i proteinId is a unique identifier in .gff file, -f bam specifies an input file as a .bam file ## then go for input alignment file and .gff file, ending with > [output file] samtools sort -o Ophnu_0h_rep1.sorted.bam Ophnu_0h_rep1.bam htseq-count -t gene -i proteinId --stranded=no -f bam Ophnu_0h_rep1.sorted.bam Ophnu1_Gene_model.gff3 >Ophnu_0h_rep1.count samtools sort -o Ophnu_0h_rep2.sorted.bam Ophnu_0h_rep2.bam htseq-count -t gene -i proteinId --stranded=no -f bam Ophnu_0h_rep2.sorted.bam Ophnu1_Gene_model.gff3 >Ophnu_0h_rep2.count samtools sort -o Ophnu_0h_rep3.sorted.bam Ophnu_0h_rep3.bam htseq-count -t gene -i proteinId --stranded=no -f bam Ophnu_0h_rep3.sorted.bam Ophnu1_Gene_model.gff3 >Ophnu_0h_rep3.count samtools sort -o Ophnu_27h_rep1.sorted.bam Ophnu_27h_rep1.bam htseq-count -t gene -i proteinId --stranded=no -f bam Ophnu_27h_rep1.sorted.bam Ophnu1_Gene_model.gff3 >Ophnu_27h_rep1.count samtools sort -o Ophnu_27h_rep2.sorted.bam Ophnu_27h_rep2.bam htseq-count -t gene -i proteinId --stranded=no -f bam Ophnu_27h_rep2.sorted.bam Ophnu1_Gene_model.gff3 >Ophnu_27h_rep2.count samtools sort -o Ophnu_27h_rep3.sorted.bam Ophnu_27h_rep3.bam htseq-count -t gene -i proteinId --stranded=no -f bam Ophnu_27h_rep3.sorted.bam Ophnu1_Gene_model.gff3 >Ophnu_27h_rep3.count echo "Job done" date +"%d %B %Y %H:%M:%S" cat htseq_20190111.sub ######################################################################### Results from htseq-count Ophnu Proportion of ambiguous alignment: 0.007 - 0.011% Proportion of reads not have a unique aligment: 0.34 - 0.49% Proportion of reads mapped to no features: 6.8 - 13.8% #########################################################################