Overall of UstRNASeq pipeline version 1: December 2018
Teeratas Kijpornyongpan
Start from filtered reads (PE reads having adapters and low-quality bases trimmed)
Do tophat to map reads to the reference genomes --> Then ht-seq count to get known transcripts from .gff annotation

#Jan 7, 2019 --> Found from ht-seq count results that many reads are mapped to no feature
#            --> think about mapping reads to de novo transcriptome assembly in the future, but not for now.
#########################################################################

Jobfile Trimmomatics
However, Genomics Core already did the trimming for us. Just note the scripts they used

#PBS -q mycospace
#PBS -l walltime=168:00:00
#PBS -l nodes=1:ppn=5

module load java
module load trimmomatic/0.36

echo "Job started"
date +"%d %B %Y %H:%M:%S"

java -classpath /group/bioinfo/apps/apps/trimmomatic-0.36/trimmomatic-0.36.jar org.usadellab.trimmomatic.TrimmomaticPE  \
-threads 5 -phred33 /depot/maime/data/Tas/Dimorphism/Dimorphism_RNASeq_October2018/MCA3882_dimorphism_RNASeq/035912_MCA3882-G-R1/Unaligned_filtered/035912_MCA3882-G-R1_S71_R1_filtered.fastq.gz \
/depot/maime/data/Tas/Dimorphism/Dimorphism_RNASeq_October2018/MCA3882_dimorphism_RNASeq/035912_MCA3882-G-R1/Unaligned_filtered/035912_MCA3882-G-R1_S71_R2_filtered.fastq.gz \
./MCA3882/MCA3882-G-R1_Trimmed_R1_filtered.fastq.gz ./MCA3882/MCA3882-G-R1_Trimmed_R2_filtered.fastq.gz

TruSeq3-PE-2.fa
LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:32 

echo "Job done"
date +"%d %B %Y %H:%M:%S"
cat Trimmomatic_20181126.sub

#########################################################################
#####Notes from Purdue Genomics Core#####
Unaligned_filtered We run either Trimmomatic and/or cutadapt in order 
 to remove adapters. Poor quality bases (default of less than Phred-20)
 are removed (clipped) from both the 5' and 3' ends of the Unaligned 
 reads. After adapter removal and quality clipping any reads below a 
 minimum length (default 30 bases) are discarded. The Unaligned_filtered
 reads are the ones most often used for further processing. 
 The filtered reads for a sample are put all together into one large 
 file per read direction. You can access these either via the given link
 the 'Unaligned_filtered' directory(s) below or via the per-sample 
 directory. For paired-end reads it is possible that one of the reads 
 in a pair is still viable while the other is not. Viable now-single-end
 reads are put in the 'Unaligned_filtered_unpaired' directory. 
For paired-end (99% of the reads we do), the code used is: trimmomatic PE
 LEADING:20 TRAILING:20 MINLEN:30 ILLUMINACLIP:our_vector_file:2:40:10
#########################################################################

####Start from mapping reads that have adapters and low base quality trimmed, use tophat to map reads #####
Jobfile TopHat

#PBS -q mycospace
#PBS -l walltime=168:00:00
#PBS -l nodes=1:ppn=5

echo "Job started"
date +"%d %B %Y %H:%M:%S"

cd /scratch/halstead/t/tkijporn/RNASeq_workingspace/TopHat_references
module load bowtie2/2.2.9
module load perl
###Build genome references ###
bowtie2­build Meimi1_Repeatmasked.fasta Meimi1_Genome_Ref
bowtie2­build Tilwa1_Repeatmasked.fasta Tilwa1_Genome_Ref


###Then, do tophat ###
cd ../MCA3882/

module load tophat/2.1.1
module load samtools/1.9

tophat -p 5 -o MCA3882_G-R1_tophat ../TopHat_references/Meimi1_Genome_Ref \
/depot/maime/data/Tas/Dimorphism/Dimorphism_RNASeq_October2018/MCA3882_dimorphism_RNASeq/035912_MCA3882-G-R1/Unaligned_filtered/035912_MCA3882-G-R1_S71_R1_filtered.fastq.gz \
/depot/maime/data/Tas/Dimorphism/Dimorphism_RNASeq_October2018/MCA3882_dimorphism_RNASeq/035912_MCA3882-G-R1/Unaligned_filtered/035912_MCA3882-G-R1_S71_R2_filtered.fastq.gz

tophat -p 5 -o MCA3882_G-R3_tophat ../TopHat_references/Meimi1_Genome_Ref \
/depot/maime/data/Tas/Dimorphism/Dimorphism_RNASeq_October2018/MCA3882_dimorphism_RNASeq/035913_MCA3882-G-R3/Unaligned_filtered/035913_MCA3882-G-R3_S72_R1_filtered.fastq.gz \
/depot/maime/data/Tas/Dimorphism/Dimorphism_RNASeq_October2018/MCA3882_dimorphism_RNASeq/035913_MCA3882-G-R3/Unaligned_filtered/035913_MCA3882-G-R3_S72_R2_filtered.fastq.gz

tophat -p 5 -o MCA3882_G-R5_tophat ../TopHat_references/Meimi1_Genome_Ref \
/depot/maime/data/Tas/Dimorphism/Dimorphism_RNASeq_October2018/MCA3882_dimorphism_RNASeq/035914_MCA3882-G-R5/Unaligned_filtered/035914_MCA3882-G-R5_S73_R1_filtered.fastq.gz \
/depot/maime/data/Tas/Dimorphism/Dimorphism_RNASeq_October2018/MCA3882_dimorphism_RNASeq/035914_MCA3882-G-R5/Unaligned_filtered/035914_MCA3882-G-R5_S73_R2_filtered.fastq.gz

tophat -p 5 -o MCA3882_T-R2_tophat ../TopHat_references/Meimi1_Genome_Ref \
/depot/maime/data/Tas/Dimorphism/Dimorphism_RNASeq_October2018/MCA3882_dimorphism_RNASeq/035915_MCA3882-T-R2/Unaligned_filtered/035915_MCA3882-T-R2_S74_R1_filtered.fastq.gz \
/depot/maime/data/Tas/Dimorphism/Dimorphism_RNASeq_October2018/MCA3882_dimorphism_RNASeq/035915_MCA3882-T-R2/Unaligned_filtered/035915_MCA3882-T-R2_S74_R2_filtered.fastq.gz

tophat -p 5 -o MCA3882_T-R3_tophat ../TopHat_references/Meimi1_Genome_Ref \
/depot/maime/data/Tas/Dimorphism/Dimorphism_RNASeq_October2018/MCA3882_dimorphism_RNASeq/035916_MCA3882-T-R3/Unaligned_filtered/035916_MCA3882-T-R3_S75_R1_filtered.fastq.gz \
/depot/maime/data/Tas/Dimorphism/Dimorphism_RNASeq_October2018/MCA3882_dimorphism_RNASeq/035916_MCA3882-T-R3/Unaligned_filtered/035916_MCA3882-T-R3_S75_R2_filtered.fastq.gz

tophat -p 5 -o MCA3882_T-R6_tophat ../TopHat_references/Meimi1_Genome_Ref \
/depot/maime/data/Tas/Dimorphism/Dimorphism_RNASeq_October2018/MCA3882_dimorphism_RNASeq/035917_MCA3882-T-R6/Unaligned_filtered/035917_MCA3882-T-R6_S76_R1_filtered.fastq.gz \
/depot/maime/data/Tas/Dimorphism/Dimorphism_RNASeq_October2018/MCA3882_dimorphism_RNASeq/035917_MCA3882-T-R6/Unaligned_filtered/035917_MCA3882-T-R6_S76_R2_filtered.fastq.gz

cd ../MCA4186

tophat -p 5 -o MCA4186_G-R2_tophat ../TopHat_references/Tilwa1_Genome_Ref \
/depot/maime/data/Tas/Dimorphism/Dimorphism_RNASeq_October2018/MCA4186_dimorphism_RNASeq/034579_MCA4186-G-R2/Unaligned_filtered/034579_MCA4186-G-R2_S5_R1_filtered.fastq.gz \
/depot/maime/data/Tas/Dimorphism/Dimorphism_RNASeq_October2018/MCA4186_dimorphism_RNASeq/034579_MCA4186-G-R2/Unaligned_filtered/034579_MCA4186-G-R2_S5_R2_filtered.fastq.gz

tophat -p 5 -o MCA4186_G-R3_tophat ../TopHat_references/Tilwa1_Genome_Ref \
/depot/maime/data/Tas/Dimorphism/Dimorphism_RNASeq_October2018/MCA4186_dimorphism_RNASeq/034580_MCA4186-G-R3/Unaligned_filtered/034580_MCA4186-G-R3_S6_R1_filtered.fastq.gz \
/depot/maime/data/Tas/Dimorphism/Dimorphism_RNASeq_October2018/MCA4186_dimorphism_RNASeq/034580_MCA4186-G-R3/Unaligned_filtered/034580_MCA4186-G-R3_S6_R2_filtered.fastq.gz

tophat -p 5 -o MCA4186_G-R4_tophat ../TopHat_references/Tilwa1_Genome_Ref \
/depot/maime/data/Tas/Dimorphism/Dimorphism_RNASeq_October2018/MCA4186_dimorphism_RNASeq/034581_MCA4186-G-R4/Unaligned_filtered/034581_MCA4186-G-R4_S7_R1_filtered.fastq.gz \
/depot/maime/data/Tas/Dimorphism/Dimorphism_RNASeq_October2018/MCA4186_dimorphism_RNASeq/034581_MCA4186-G-R4/Unaligned_filtered/034581_MCA4186-G-R4_S7_R2_filtered.fastq.gz

tophat -p 5 -o MCA4186_G-R5_tophat ../TopHat_references/Tilwa1_Genome_Ref \
/depot/maime/data/Tas/Dimorphism/Dimorphism_RNASeq_October2018/MCA4186_dimorphism_RNASeq/034582_MCA4186-G-R5/Unaligned_filtered/034582_MCA4186-G-R5_S8_R1_filtered.fastq.gz \
/depot/maime/data/Tas/Dimorphism/Dimorphism_RNASeq_October2018/MCA4186_dimorphism_RNASeq/034582_MCA4186-G-R5/Unaligned_filtered/034582_MCA4186-G-R5_S8_R2_filtered.fastq.gz

tophat -p 5 -o MCA4186_T-R2_tophat ../TopHat_references/Tilwa1_Genome_Ref \
/depot/maime/data/Tas/Dimorphism/Dimorphism_RNASeq_October2018/MCA4186_dimorphism_RNASeq/034584_MCA4186-T-R2/Unaligned_filtered/034584_MCA4186-T-R2_S9_R1_filtered.fastq.gz \
/depot/maime/data/Tas/Dimorphism/Dimorphism_RNASeq_October2018/MCA4186_dimorphism_RNASeq/034584_MCA4186-T-R2/Unaligned_filtered/034584_MCA4186-T-R2_S9_R2_filtered.fastq.gz

tophat -p 5 -o MCA4186_T-R3_tophat ../TopHat_references/Tilwa1_Genome_Ref \
/depot/maime/data/Tas/Dimorphism/Dimorphism_RNASeq_October2018/MCA4186_dimorphism_RNASeq/034585_MCA4186-T-R3/Unaligned_filtered/034585_MCA4186-T-R3_S10_R1_filtered.fastq.gz \
/depot/maime/data/Tas/Dimorphism/Dimorphism_RNASeq_October2018/MCA4186_dimorphism_RNASeq/034585_MCA4186-T-R3/Unaligned_filtered/034585_MCA4186-T-R3_S10_R2_filtered.fastq.gz

tophat -p 5 -o MCA4186_T-R4_tophat ../TopHat_references/Tilwa1_Genome_Ref \
/depot/maime/data/Tas/Dimorphism/Dimorphism_RNASeq_October2018/MCA4186_dimorphism_RNASeq/034586_MCA4186-T-R4/Unaligned_filtered/034586_MCA4186-T-R4_S11_R1_filtered.fastq.gz \
/depot/maime/data/Tas/Dimorphism/Dimorphism_RNASeq_October2018/MCA4186_dimorphism_RNASeq/034586_MCA4186-T-R4/Unaligned_filtered/034586_MCA4186-T-R4_S11_R2_filtered.fastq.gz

tophat -p 5 -o MCA4186_T-R5_tophat ../TopHat_references/Tilwa1_Genome_Ref \
/depot/maime/data/Tas/Dimorphism/Dimorphism_RNASeq_October2018/MCA4186_dimorphism_RNASeq/034587_MCA4186-T-R5/Unaligned_filtered/034587_MCA4186-T-R5_S12_R1_filtered.fastq.gz \
/depot/maime/data/Tas/Dimorphism/Dimorphism_RNASeq_October2018/MCA4186_dimorphism_RNASeq/034587_MCA4186-T-R5/Unaligned_filtered/034587_MCA4186-T-R5_S12_R2_filtered.fastq.gz

cd ..

echo "Job done"
date +"%d %B %Y %H:%M:%S"
cat tophat_20181127.sub

#########################################################################
#####Summary on TopHat #####

MCA3882_G-R1 TopHat
Left reads:
          Input     :  10296423
           Mapped   :  10193479 (99.0% of input)
            of these:     64059 ( 0.6%) have multiple alignments (138 have >20)
Right reads:
          Input     :  10296423
           Mapped   :  10104013 (98.1% of input)
            of these:     63455 ( 0.6%) have multiple alignments (138 have >20)
98.6% overall read mapping rate.

Aligned pairs:  10025456
     of these:     62380 ( 0.6%) have multiple alignments
                  143215 ( 1.4%) are discordant alignments
96.0% concordant pair alignment rate.


MCA3882_G-R3 TopHat
Left reads:
          Input     :   6900636
           Mapped   :   6830564 (99.0% of input)
            of these:     41203 ( 0.6%) have multiple alignments (86 have >20)
Right reads:
          Input     :   6900636
           Mapped   :   6765444 (98.0% of input)
            of these:     40885 ( 0.6%) have multiple alignments (86 have >20)
98.5% overall read mapping rate.

Aligned pairs:   6712545
     of these:     40155 ( 0.6%) have multiple alignments
                  102003 ( 1.5%) are discordant alignments
95.8% concordant pair alignment rate.


MCA3882_G-R5 TopHat
Left reads:
          Input     :   6908742
           Mapped   :   6853565 (99.2% of input)
            of these:     38619 ( 0.6%) have multiple alignments (65 have >20)
Right reads:
          Input     :   6908742
           Mapped   :   6769125 (98.0% of input)
            of these:     38145 ( 0.6%) have multiple alignments (65 have >20)
98.6% overall read mapping rate.

Aligned pairs:   6731191
     of these:     37607 ( 0.6%) have multiple alignments
                   28949 ( 0.4%) are discordant alignments
97.0% concordant pair alignment rate.


MCA3882_T-R2 TopHat
Left reads:
          Input     :   4421922
           Mapped   :   4380451 (99.1% of input)
            of these:     27090 ( 0.6%) have multiple alignments (51 have >20)
Right reads:
          Input     :   4421922
           Mapped   :   4323437 (97.8% of input)
            of these:     26832 ( 0.6%) have multiple alignments (51 have >20)
98.4% overall read mapping rate.

Aligned pairs:   4295688
     of these:     26279 ( 0.6%) have multiple alignments
                   96485 ( 2.2%) are discordant alignments
95.0% concordant pair alignment rate.


MCA3882_T-R3 TopHat
Left reads:
          Input     :   9691723
           Mapped   :   9571560 (98.8% of input)
            of these:     68039 ( 0.7%) have multiple alignments (126 have >20)
Right reads:
          Input     :   9691723
           Mapped   :   9466628 (97.7% of input)
            of these:     67452 ( 0.7%) have multiple alignments (126 have >20)
98.2% overall read mapping rate.

Aligned pairs:   9375317
     of these:     66192 ( 0.7%) have multiple alignments
                 1237061 (13.2%) are discordant alignments
84.0% concordant pair alignment rate.


MCA3882_T-R6 TopHat
Left reads:
          Input     :   5635843
           Mapped   :   5585689 (99.1% of input)
            of these:     35429 ( 0.6%) have multiple alignments (135 have >20)
Right reads:
          Input     :   5635843
           Mapped   :   5538440 (98.3% of input)
            of these:     35295 ( 0.6%) have multiple alignments (135 have >20)
98.7% overall read mapping rate.

Aligned pairs:   5500212
     of these:     34490 ( 0.6%) have multiple alignments
                  343687 ( 6.2%) are discordant alignments
91.5% concordant pair alignment rate.


MCA4186_G-R2 TopHat
Left reads:
          Input     :  15509071
           Mapped   :  13860470 (89.4% of input)
            of these:    217383 ( 1.6%) have multiple alignments (17 have >20)
Right reads:
          Input     :  15509071
           Mapped   :  13762044 (88.7% of input)
            of these:    216636 ( 1.6%) have multiple alignments (17 have >20)
89.1% overall read mapping rate.

Aligned pairs:  13520722
     of these:    214016 ( 1.6%) have multiple alignments
                  113872 ( 0.8%) are discordant alignments
86.4% concordant pair alignment rate.


MCA4186_G-R3 TopHat
Left reads:
          Input     :  13997081
           Mapped   :  12894158 (92.1% of input)
            of these:    203012 ( 1.6%) have multiple alignments (24 have >20)
Right reads:
          Input     :  13997081
           Mapped   :  12781984 (91.3% of input)
            of these:    202504 ( 1.6%) have multiple alignments (24 have >20)
91.7% overall read mapping rate.

Aligned pairs:  12548473
     of these:    199674 ( 1.6%) have multiple alignments
                  206940 ( 1.6%) are discordant alignments
88.2% concordant pair alignment rate.


MCA4186_G-R4 TopHat
Left reads:
          Input     :  17449829
           Mapped   :  12380710 (71.0% of input)
            of these:    192909 ( 1.6%) have multiple alignments (18 have >20)
Right reads:
          Input     :  17449829
           Mapped   :  12177628 (69.8% of input)
            of these:    191376 ( 1.6%) have multiple alignments (19 have >20)
70.4% overall read mapping rate.

Aligned pairs:  11911676
     of these:    188409 ( 1.6%) have multiple alignments
                  131168 ( 1.1%) are discordant alignments
67.5% concordant pair alignment rate.


MCA4186_G-R5 TopHat
Left reads:
          Input     :  19403721
           Mapped   :  17626165 (90.8% of input)
            of these:    285962 ( 1.6%) have multiple alignments (42 have >20)
Right reads:
          Input     :  19403721
           Mapped   :  17498556 (90.2% of input)
            of these:    285795 ( 1.6%) have multiple alignments (42 have >20)
90.5% overall read mapping rate.

Aligned pairs:  17153794
     of these:    281277 ( 1.6%) have multiple alignments
                  353185 ( 2.1%) are discordant alignments
86.6% concordant pair alignment rate.


MCA4186_T-R2 TopHat
Left reads:
          Input     :  18489430
           Mapped   :  17424419 (94.2% of input)
            of these:    290816 ( 1.7%) have multiple alignments (46 have >20)
Right reads:
          Input     :  18489430
           Mapped   :  17315707 (93.7% of input)
            of these:    290925 ( 1.7%) have multiple alignments (46 have >20)
93.9% overall read mapping rate.

Aligned pairs:  16985658
     of these:    286263 ( 1.7%) have multiple alignments
                  380041 ( 2.2%) are discordant alignments
89.8% concordant pair alignment rate.


MCA4186_T-R3 TopHat
Left reads:
          Input     :  18741730
           Mapped   :  17966515 (95.9% of input)
            of these:    327099 ( 1.8%) have multiple alignments (21 have >20)
Right reads:
          Input     :  18741730
           Mapped   :  17845083 (95.2% of input)
            of these:    326817 ( 1.8%) have multiple alignments (21 have >20)
95.5% overall read mapping rate.

Aligned pairs:  17530620
     of these:    322127 ( 1.8%) have multiple alignments
                  327694 ( 1.9%) are discordant alignments
91.8% concordant pair alignment rate.


MCA4186_T-R4 TopHat
Left reads:
          Input     :  17417303
           Mapped   :  16412017 (94.2% of input)
            of these:    279739 ( 1.7%) have multiple alignments (19 have >20)
Right reads:
          Input     :  17417303
           Mapped   :  16279223 (93.5% of input)
            of these:    278487 ( 1.7%) have multiple alignments (19 have >20)
93.8% overall read mapping rate.

Aligned pairs:  16031329
     of these:    275813 ( 1.7%) have multiple alignments
                  100771 ( 0.6%) are discordant alignments
91.5% concordant pair alignment rate.


MCA4186_T-R5 TopHat
Left reads:
          Input     :   7870313
           Mapped   :   7408882 (94.1% of input)
            of these:    108213 ( 1.5%) have multiple alignments (6 have >20)
Right reads:
          Input     :   7870313
           Mapped   :   7312154 (92.9% of input)
            of these:    107745 ( 1.5%) have multiple alignments (6 have >20)
93.5% overall read mapping rate.

Aligned pairs:   7179465
     of these:    106108 ( 1.5%) have multiple alignments
                  133660 ( 1.9%) are discordant alignments
89.5% concordant pair alignment rate.
#########################################################################

#####Then, transform .bam file into read counts using htseq #####
Jobfile htseq for Tilwa

#PBS -q mycospace
#PBS -l walltime=50:00:00
#PBS -l nodes=1:ppn=1
#PBS -N htseq_Tilwa
echo "Job started"
date +"%d %B %Y %H:%M:%S"

cd $PBS_O_WORKDIR
module load samtools/1.9
module load biopython/2.7.12
module load htseq/0.7.0

####Read counts using htseq --> take around one hour-ish for each .bam file#####
##Will start by sorting alignment using samtools, then go for htseq count ##
## -t CDS means collects the CDS data, -i proteinId is a unique identifier in .gff file, -f bam specifies an input file as a .bam file
## then go for input alignment file and .gff file, ending with > [output file]

samtools sort -o MCA4186_G-R2.sorted.bam MCA4186_G-R2.bam
htseq-count -t CDS -i proteinId  --stranded=no -f bam MCA4186_G-R2.sorted.bam Tilwa1_GeneCatalog_genes_20141225.gff >MCA4186_G-R2.count

samtools sort -o MCA4186_G-R3.sorted.bam MCA4186_G-R3.bam
htseq-count -t CDS -i proteinId  --stranded=no -f bam MCA4186_G-R3.sorted.bam Tilwa1_GeneCatalog_genes_20141225.gff >MCA4186_G-R3.count

samtools sort -o MCA4186_G-R4.sorted.bam MCA4186_G-R4.bam
htseq-count -t CDS -i proteinId  --stranded=no -f bam MCA4186_G-R4.sorted.bam Tilwa1_GeneCatalog_genes_20141225.gff >MCA4186_G-R4.count

samtools sort -o MCA4186_G-R5.sorted.bam MCA4186_G-R5.bam
htseq-count -t CDS -i proteinId  --stranded=no -f bam MCA4186_G-R5.sorted.bam Tilwa1_GeneCatalog_genes_20141225.gff >MCA4186_G-R5.count

samtools sort -o MCA4186_T-R2.sorted.bam MCA4186_T-R2.bam
htseq-count -t CDS -i proteinId  --stranded=no -f bam MCA4186_T-R2.sorted.bam Tilwa1_GeneCatalog_genes_20141225.gff >MCA4186_T-R2.count

samtools sort -o MCA4186_T-R3.sorted.bam MCA4186_T-R3.bam
htseq-count -t CDS -i proteinId  --stranded=no -f bam MCA4186_T-R3.sorted.bam Tilwa1_GeneCatalog_genes_20141225.gff >MCA4186_T-R3.count

samtools sort -o MCA4186_T-R4.sorted.bam MCA4186_T-R4.bam
htseq-count -t CDS -i proteinId  --stranded=no -f bam MCA4186_T-R4.sorted.bam Tilwa1_GeneCatalog_genes_20141225.gff >MCA4186_T-R4.count

samtools sort -o MCA4186_T-R5.sorted.bam MCA4186_T-R5.bam
htseq-count -t CDS -i proteinId  --stranded=no -f bam MCA4186_T-R5.sorted.bam Tilwa1_GeneCatalog_genes_20141225.gff >MCA4186_T-R5.count

echo "Job done"
date +"%d %B %Y %H:%M:%S"
cat htseq_20190107.sub
#########################################################################

Jobfile htseq for Meimi

#PBS -q mycospace
#PBS -l walltime=50:00:00
#PBS -l nodes=1:ppn=1
#PBS -N htseq_Meimi
echo "Job started"
date +"%d %B %Y %H:%M:%S"

cd $PBS_O_WORKDIR
module load samtools/1.9
module load biopython/2.7.12
module load htseq/0.7.0

####Read counts using htseq --> take around one hour-ish for each .bam file#####
##Will start by sorting alignment using samtools, then go for htseq count ##
## -t CDS means collects the CDS data, -i proteinId is a unique identifier in .gff file, -f bam specifies an input file as a .bam file
## then go for input alignment file and .gff file, ending with > [output file]

samtools sort -o MCA3882_G-R1.sorted.bam MCA3882_G-R1.bam
htseq-count -t CDS -i proteinId  --stranded=no -f bam MCA3882_G-R1.sorted.bam Meimi1_GeneCatalog_genes_20150112.gff >MCA3882_G-R1.count

samtools sort -o MCA3882_G-R3.sorted.bam MCA3882_G-R3.bam
htseq-count -t CDS -i proteinId  --stranded=no -f bam MCA3882_G-R3.sorted.bam Meimi1_GeneCatalog_genes_20150112.gff >MCA3882_G-R3.count

samtools sort -o MCA3882_G-R5.sorted.bam MCA3882_G-R5.bam
htseq-count -t CDS -i proteinId  --stranded=no -f bam MCA3882_G-R5.sorted.bam Meimi1_GeneCatalog_genes_20150112.gff >MCA3882_G-R5.count

samtools sort -o MCA3882_T-R2.sorted.bam MCA3882_T-R2.bam
htseq-count -t CDS -i proteinId  --stranded=no -f bam MCA3882_T-R2.sorted.bam Meimi1_GeneCatalog_genes_20150112.gff >MCA3882_T-R2.count

samtools sort -o MCA3882_T-R3.sorted.bam MCA3882_T-R3.bam
htseq-count -t CDS -i proteinId  --stranded=no -f bam MCA3882_T-R3.sorted.bam Meimi1_GeneCatalog_genes_20150112.gff >MCA3882_T-R3.count

samtools sort -o MCA3882_T-R6.sorted.bam MCA3882_T-R6.bam
htseq-count -t CDS -i proteinId  --stranded=no -f bam MCA3882_T-R6.sorted.bam Meimi1_GeneCatalog_genes_20150112.gff >MCA3882_T-R6.count

echo "Job done"
date +"%d %B %Y %H:%M:%S"
cat htseq_20190107.sub

Jobfile htseq for Tilwa

Job started
07 January 2019 14:45:41
Job done
07 January 2019 18:56:29
#PBS -q mycospace
#PBS -l walltime=50:00:00
#PBS -l nodes=1:ppn=1
#PBS -N htseq_Tilwa
echo "Job started"
date +"%d %B %Y %H:%M:%S"

cd $PBS_O_WORKDIR
module load samtools/1.9
module load biopython/2.7.12
module load htseq/0.7.0

#####Read counts using htseq --> take around one hour-ish for each .bam file#####
##Will start by sorting alignment using samtools, then go for htseq count ##
## -t CDS means collects the CDS data, -i proteinId is a unique identifier in .gff file, -f bam specifies an input file as a .bam file
## then go for input alignment file and .gff file, ending with > [output file]

samtools sort -o MCA4186_G-R2.sorted.bam MCA4186_G-R2.bam
htseq-count -t CDS -i proteinId  --stranded=no -f bam MCA4186_G-R2.sorted.bam Tilwa1_GeneCatalog_genes_20141225.gff >MCA4186_G-R2.count

samtools sort -o MCA4186_G-R3.sorted.bam MCA4186_G-R3.bam
htseq-count -t CDS -i proteinId  --stranded=no -f bam MCA4186_G-R3.sorted.bam Tilwa1_GeneCatalog_genes_20141225.gff >MCA4186_G-R3.count

samtools sort -o MCA4186_G-R4.sorted.bam MCA4186_G-R4.bam
htseq-count -t CDS -i proteinId  --stranded=no -f bam MCA4186_G-R4.sorted.bam Tilwa1_GeneCatalog_genes_20141225.gff >MCA4186_G-R4.count

samtools sort -o MCA4186_G-R5.sorted.bam MCA4186_G-R5.bam
htseq-count -t CDS -i proteinId  --stranded=no -f bam MCA4186_G-R5.sorted.bam Tilwa1_GeneCatalog_genes_20141225.gff >MCA4186_G-R5.count

samtools sort -o MCA4186_T-R2.sorted.bam MCA4186_T-R2.bam
htseq-count -t CDS -i proteinId  --stranded=no -f bam MCA4186_T-R2.sorted.bam Tilwa1_GeneCatalog_genes_20141225.gff >MCA4186_T-R2.count

samtools sort -o MCA4186_T-R3.sorted.bam MCA4186_T-R3.bam
htseq-count -t CDS -i proteinId  --stranded=no -f bam MCA4186_T-R3.sorted.bam Tilwa1_GeneCatalog_genes_20141225.gff >MCA4186_T-R3.count

samtools sort -o MCA4186_T-R4.sorted.bam MCA4186_T-R4.bam
htseq-count -t CDS -i proteinId  --stranded=no -f bam MCA4186_T-R4.sorted.bam Tilwa1_GeneCatalog_genes_20141225.gff >MCA4186_T-R4.count

samtools sort -o MCA4186_T-R5.sorted.bam MCA4186_T-R5.bam
htseq-count -t CDS -i proteinId  --stranded=no -f bam MCA4186_T-R5.sorted.bam Tilwa1_GeneCatalog_genes_20141225.gff >MCA4186_T-R5.count

echo "Job done"
date +"%d %B %Y %H:%M:%S"
cat htseq_20190107.sub

#########################################################################
Results from htseq-count so far
Meimi (MCA3882)
Proportion of ambiguous alignment: < 0.01%
Proportion of reads not have a unique aligment: 1.5-2.0%
Proportion of reads mapped to no features: 14-16%

Meimi (MCA4186)
Proportion of ambiguous alignment: 0.05-0.06%
Proportion of reads not have a unique aligment: 3.2-3.7%
Proportion of reads mapped to no features: 12-15%
#########################################################################