###################################################################### # Get the GEO html file, parse the SRP ID (SRA serie ID), parse the # SRX ID (SRA sample ID), then the SRR ID (SRA run ID) and download # them. Samples and runs IDs are stored in the two files 'samples' and # 'samples2' ###################################################################### GSE=GSE39128 wget -q http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=$GSE -O $GSE.html SRP=$(grep -o 'SRP[0-9]*<' $GSE.html | sort -u | sed 's/ samples # delete from this list the control samples nypgal and nypd sed -i '/nyp/d' samples for SRX in $(awk '{print $1}' samples) do wget -q http://www.ncbi.nlm.nih.gov/sra/$SRX -O $SRX.html for SRR in $(grep -o 'SRR[0-9]*' $SRX.html | sort -u); do SRRDIR=$(echo $SRR | awk 'BEGIN{FS=""}{print $1 $2 $3 $4 $5 $6}') wget https://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByRun/sra/SRR/$SRRDIR/$SRR/$SRR.sra echo "$SRX $SRR" >> samples2 done done ###################################################################### # Extract fastq files of each run using fastq-dump (v. 2.3.5 download # from # http://www.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?cmd=show&f=software&\ # m=software&s=software ). ###################################################################### for FILE in $(ls *.sra) do fastq-dump --split-3 $FILE done ###################################################################### # The protocol to construct the sequencing library is quite different # from the normal TSS mapping promotocol. In this case, it in designed # to map both ends of a transcript. To do so, it links them using an # adapter. Sequencing adapters (with barcodes) are then added within # the transcript resulting in reads that start on the transcript, goes # towards the ends and cover the adapter. This results in the 5' end # being sequence on the reverse, with the TSS located in the middle of # the reads. Since I am only interested in the 5' ends, I have to map # the adapter to the reads (the reverse-complement of it), extract the # sequence before its starting point (this is the TSS) and get the # reverse complement of it. I have to check if I have to trim the # sequencing barcodes that, in this scenario, will be at the end of # the resulting reads. This seems to be the case. Looking at the reads # there is a barcode of 7 nucleotides at the beginning of each read. # These are 4 adapters that has been used: # Intromolecular (A-B format) AAAAAACTCCAGGCGGCCGCTATCACTCTGAGCAATACC # Intromolecular (B-A form) AAAAAAGTGGAGGCGGCCGCTATATCACTCTGAGCAATACC # Intermolecular (A-A form) AAAAAACTCCAGGCGGCCGCTATATCACTCTGAGCAATACC # Intermolecular (B-B format) AAAAAAGTGGAGGCGGCCGCTATCACTCTGAGCAATACC # I am only interested in the last bases of the adapters since they # define the boundaries with the TSS. I have to map the reverse # complement of the last part of the adapter to the reads (perfect # match only). It is good to notice that all of them have the same # sequence in that part. Looking in PWMScan, a sequence of 12 bp long # (GGTATTGCTCAG, the reverse-complement of adapters' last 12 bases) # has only one hit in the S. cerevisie genome. Reducing it to 11 bases # bring the hits to 7 and is present in almost all reads of a # sample. I will use this [GGTATTGCTCA] to scan the reads, trim them # of the adapter, reverse complement the resulting reads, trim the # barcode and map them to the genome. # P.S.: The sequencing is paired-end (fwd-rev). I have to do the same # procedure for both files and then merge them into one since I am # interest only in one end. I will then map it as if it was single-end for I in $(awk '{print $2}' samples2); do # get rid of the adapter and barcode: echo $I cat $I\_[12].fastq | ./mapAdapter.pl > $I\_trimmed.fastq 2> /dev/null # map to the genome (bowtie and bowtie2): echo $I >> bowtie.output bowtie -p 30 /home/local/db/bowtie/s_cerevisiae_sacCer3 -q $I\_trimmed.fastq -S $I.bowtie.sam 2>> bowtie.output echo $I >> bowtie2.output bowtie2 -p 30 -x /home/local/db/bowtie2/s_cerevisiae_sacCer3 -q $I\_trimmed.fastq -S $I.bowtie2.sam 2>> bowtie2.output done # bowtie2 deliver better alignemt rate on all samples. I will use its # results. ###################################################################### # Generate SGA files for each samples using samtools (v. , download # from http://samtools.sourceforge.net/ ) and ChIP-Seq tools (download # from http://sourceforge.net/projects/chip-seq/ ) ###################################################################### for SRR in $(awk '{print $2}' samples2); do SRX=$(awk -v "var=$SRR" '$2==var {print $1}' samples2) NAME=$(awk -v "var=$SRX" '$1==var {print $2}' samples | sed 's/://') FEATURE=TSSseq echo "Processing $NAME -> $FEATURE" if [ -f $SRR.bed ]; then echo " bed file already done" else samtools view -bS -o $SRR.bam $SRR.bowtie2.sam 2> /dev/null bamToBed -i $SRR.bam > $SRR.bed fi # get rid of the strange chromosome names sed -i 's/^[^ ]*|//g' $SRR.bed bed2sga.pl -s sacCer3 -f $FEATURE < $SRR.bed | sort -s -k1,1 -k3,3n -k4,4 | compactsga > $SRR.sga if [ -f $NAME.sga ]; then mv $NAME.sga temp.sga sort -k1,1 -k3,3n -k4,4 $SRR.sga temp.sga | compactsga > $NAME.sga else mv $SRR.sga $NAME.sga fi done # Make the txt file: echo -e "Filename\tDescription\tFeature\tData-type\tOriented\tField-6\tGEO-ref\tTrack-URL\tFPS" > pelechano13.txt for SRX in $(awk '{print $1}' samples | sort -u); do # SRX=$(awk -v "var=$SRR" '$2==var {print $1}' samples2) NAME=$(awk -v "var=$SRX" '$1==var {print $2}' samples | sed 's/://') MEDIA=$(awk -v "var=$SRX" '$1==var {split($3,name,"_"); print name[1]}' samples | sed 's/://') REP=$(awk -v "var=$SRX" '$1==var {split($3,name,"_"); print name[2]}' samples | sed 's/://') LIB=$(awk -v "var=$SRX" '$1==var {split($3,name,"_"); print name[3]}' samples | sed 's/://') FEATURE=TSSseq if [ "$MEDIA" == "ypgal" ]; then GM=Galactose else GM=Glucose fi echo -e "$NAME.sga\tWT|$FEATURE|$GM|$LIB$REP\t$FEATURE\tRNA-seq\tT\t-\t$NAME\t-\t-" done | sort -k2,2 | sed 's/;/-/' >> pelechano13.txt