#!/usr/local/bin/perl #transORF.pl #Adapted by Leonard Foster from code by #Author: Paul Stothard, Genome Canada Bioinformatics Help Desk #This script takes a DNA sequence file (in FASTA format) translates #ORFs of a given length in all six reading frames #Usage: perl transOrf.pl -i -o #use warnings; #use strict; use Getopt::Std; getopt 'io'; %fastaFile = (); open (DNAFILE, $opt_i) || die "Can't open input file!\n"; $/ = undef; my $wholeFile = ; %fastaFile = $wholeFile =~ />(.+)\n([AaCcTtGgNn\n]+)/gm; close DNAFILE; #Declare a variable to hold the minimum ORF size you want to be returned #(in bases). my $MINORF = 102; #Declare a hash table for translating DNA into protein. my %translationHash = (gca => "A", gcg => "A", gct => "A", gcc => "A", gcn => "A", tgc => "C", tgt => "C", gat => "D", gac => "D", gaa => "E", gag => "E", ttt => "F", ttc => "F", gga => "G", ggg => "G", ggc => "G", ggt => "G", ggn => "G", cat => "H", cac => "H", ata => "I", att => "I", atc => "I", aaa => "K", aag => "K", cta => "L", ctg => "L", ctt => "L", ctc => "L", ctn => "L", tta => "L", ttg => "L", atg => "M", aat => "N", aac => "N", cca => "P", cct => "P", ccg => "P", ccc => "P", ccn => "P", caa => "Q", cag => "Q", cga => "R", cgg => "R", cgc => "R", cgt => "R", cgn => "R", aga => "R", agg => "R", tca => "S", tcg => "S", tcc => "S", tct => "S", tcn => "S", agc => "S", agt => "S", aca => "T", acg => "T", acc => "T", act => "T", acn => "T", gta => "V", gtg => "V", gtc => "V", gtt => "V", gtn => "V", tgg => "W", tat => "Y", tac => "Y", tag => "", taa => "", tga => ""); open (STDOUT, ">", $opt_o) || die "Can't create output file!\n"; my $joinedAminoAcids = $reverseComplement = $sequenceTitle = ""; foreach $entry (keys %fastaFile) { my @arrayOfORFs = @arrayOfTranslations = @arrayOFBases = @reversedArray = (); $directStrand = $fastaFile{$entry}; #Filter the DNA sequence. $directStrand =~ s/>[^\n]+//; $directStrand =~ tr/GATCN/gatcn/; $directStrand =~ s/[^gatcn]//g; #Determine the reverse-complement of the sequence. The reverse-complement #will be used when looking for ORFs on the opposite strand. $reverseComplement = $directStrand; $reverseComplement =~ tr/gatcn/ctagn/; @arrayOfBases = split(/\B/, $reverseComplement); @reversedArray = reverse(@arrayOfBases); $reverseComplement = join("",@reversedArray); #ORF finding code for (my $i = 0; $i < 2; $i = $i + 1) { my $sequenceEntry = ""; my $strand = ""; if ($i == 0) { #the first time the ORF finding code executes we will search #for ORFs on the direct strand. $sequenceEntry = $directStrand; $strand = "+"; } else { #the second time the ORF finding code executes we will search #for ORFs on the reverse-complement. $sequenceEntry = $reverseComplement; $strand = "-"; } my @startsRF1 =(); my @startsRF2 =(); my @startsRF3 =(); my @stopsRF1 = (); my @stopsRF2 = (); my @stopsRF3 = (); while ($sequenceEntry =~ m/\A(?!tag|taa|tga)/gi){ push (@startsRF1, 0); } while ($sequenceEntry =~ m/\A[AaCcGgTtNn](?!tag|taa|tga)/gi){ push (@startsRF2, 0); } while ($sequenceEntry =~ m/\A[AaCcGgTtNn]{2}?(?!tag|taa|tga)/gi){ push (@startsRF3, 0); } while ($sequenceEntry =~ m/atg/gi){ my $matchPosition = pos($sequenceEntry) - 3; if (($matchPosition % 3) == 0) { push (@startsRF1, $matchPosition); } elsif ((($matchPosition + 2) % 3) == 0) { push (@startsRF2, $matchPosition); } else { push (@startsRF3, $matchPosition); } } while ($sequenceEntry =~ m/tag|taa|tga/gi){ my $matchPosition = pos($sequenceEntry); if (($matchPosition % 3) == 0) { push (@stopsRF1, $matchPosition); } elsif ((($matchPosition + 2) % 3) == 0) { push (@stopsRF2, $matchPosition); } else { push (@stopsRF3, $matchPosition); } } my $codonRange = ""; my $startPosition = 0; my $stopPosition = 0; @startsRF1 = reverse(@startsRF1); @stopsRF1 = reverse(@stopsRF1); while (scalar(@startsRF1) > 0) { $codonRange = ""; $startPosition = pop(@startsRF1); if ($startPosition < $stopPosition) { next; } while (scalar(@stopsRF1) > 0) { $stopPosition = pop(@stopsRF1); if ($stopPosition > $startPosition) { last; } } if ($stopPosition <= $startPosition) { $stopPosition = length($sequenceEntry) - (length($sequenceEntry) % 3); $codonRange = $strand . "1 " . $startPosition . ".." . $stopPosition; push (@arrayOfORFs, $codonRange); last; } else { $codonRange = $strand . "1 " . $startPosition . ".." . $stopPosition; push (@arrayOfORFs, $codonRange); } } $stopPosition = 0; @startsRF2 = reverse(@startsRF2); @stopsRF2 = reverse(@stopsRF2); while (scalar(@startsRF2) > 0) { $codonRange = ""; $startPosition = pop(@startsRF2); if ($startPosition < $stopPosition) { next; } while (scalar(@stopsRF2) > 0) { $stopPosition = pop(@stopsRF2); if ($stopPosition > $startPosition) { last; } } if ($stopPosition <= $startPosition) { $stopPosition = length($sequenceEntry) - ((length($sequenceEntry) + 2) % 3); $codonRange = $strand . "2 " . $startPosition . ".." . $stopPosition; push (@arrayOfORFs, $codonRange); last; } else { $codonRange = $strand . "2 " . $startPosition . ".." . $stopPosition; push (@arrayOfORFs, $codonRange); } } $stopPosition = 0; @startsRF3 = reverse(@startsRF3); @stopsRF3 = reverse(@stopsRF3); while (scalar(@startsRF3) > 0) { $codonRange = ""; $startPosition = pop(@startsRF3); if ($startPosition < $stopPosition) { next; } while (scalar(@stopsRF3) > 0) { $stopPosition = pop(@stopsRF3); if ($stopPosition > $startPosition) { last; } } if ($stopPosition <= $startPosition) { $stopPosition = length($sequenceEntry) - ((length($sequenceEntry) + 1) % 3); $codonRange = $strand . "3 " . $startPosition . ".." . $stopPosition; push (@arrayOfORFs, $codonRange); last; } else { $codonRange = $strand . "3 " . $startPosition . ".." . $stopPosition; push (@arrayOfORFs, $codonRange); } } } #Use a loop to examine each ORF description in @arrayOfORFs. Translate ORFs that are #longer than $MINORF, and add those translations to @arrayOfTranslations foreach(@arrayOfORFs) { #Use the matching operator to copy parts of the ORF range into variables. #The strand will be stored in $1, the reading frame in $2, the start of #the ORF in $3, and the end of the ORF in $4. $_ =~ m/([\+\-])(\d)\s(\d+)\.\.(\d+)/; #Skip ORFs that are smaller than $MINORF if (($4 - $3) < $MINORF) { next; } #Declare a variable to hold the DNA sequence of the ORF. my $ORFsequence = ""; #If the ORF is on the forward strand, use "substr" to obtain the sequence #of the ORF from $directStrand. Otherwise, obtain the sequence from #$reverseComplement. if ($1 eq "+") { $ORFsequence = substr($directStrand, $3, $4 - $3); } else { $ORFsequence = substr($reverseComplement, $3, $4 - $3); } #Now use a for loop to translate the ORF sequence codon by codon. #The amino acids are added as elements to the array @growingProtein. my @growingProtein = (); for (my $i = 0; $i <= (length($ORFsequence) - 3); $i = $i + 3) { my $codon = substr($ORFsequence, $i, 3); if (exists( $translationHash{$codon} )){ push (@growingProtein, $translationHash{$codon}); } else { push (@growingProtein, "X"); } } #Now convert the @growingProtein array to a string. $joinedAminoAcids = join("",@growingProtein); #Uncomment the following line to create a reversed database #$joinedAminoAcids = reverse $joinedAminoAcids; #it may help to add the string "Rev" to the two assignments following. . . #Add a title to the protein sequence before adding it to @arrayOfTranslations. #For ORFs on the reverse strand adjust the position numbers so that they refer #to the direct strand. if ($1 eq "+") { $joinedAminoAcids = ">" . $entry . $1 . $2 . "_" . ($3 + 1) . " " . ($3 + 1) . " to " . $4 . ", " . ($4 - $3) . " bases.\n" . $joinedAminoAcids; } else { $joinedAminoAcids = ">" . $entry . $1 . $2 . "_" . (length($directStrand) - $4 + 1) . " " . (length($directStrand) - $4 + 1) . " to " . (length($directStrand) - $3) . ", " . ($4 - $3) . " bases.\n" . $joinedAminoAcids; } #The text contained in $joinedAminoAcids can now be added on to our array #of protein translations. push (@arrayOfTranslations, $joinedAminoAcids); } foreach(@arrayOfTranslations) { print "$_ \n"; } } close STDOUT;