#!/usr/local/bin/perl
#transORF.pl
#Adapted by Leonard Foster from code by
#Author: Paul Stothard, Genome Canada Bioinformatics Help Desk
#This script takes a DNA sequence file (in FASTA format) translates
#ORFs of a given length in all six reading frames
#Usage: perl transOrf.pl -i -o
#use warnings;
#use strict;
use Getopt::Std;
getopt 'io';
%fastaFile = ();
open (DNAFILE, $opt_i) || die "Can't open input file!\n";
$/ = undef;
my $wholeFile = ;
%fastaFile = $wholeFile =~ />(.+)\n([AaCcTtGgNn\n]+)/gm;
close DNAFILE;
#Declare a variable to hold the minimum ORF size you want to be returned
#(in bases).
my $MINORF = 102;
#Declare a hash table for translating DNA into protein.
my %translationHash =
(gca => "A", gcg => "A", gct => "A", gcc => "A", gcn => "A",
tgc => "C", tgt => "C",
gat => "D", gac => "D",
gaa => "E", gag => "E",
ttt => "F", ttc => "F",
gga => "G", ggg => "G", ggc => "G", ggt => "G", ggn => "G",
cat => "H", cac => "H",
ata => "I", att => "I", atc => "I",
aaa => "K", aag => "K",
cta => "L", ctg => "L", ctt => "L", ctc => "L", ctn => "L", tta => "L", ttg => "L",
atg => "M",
aat => "N", aac => "N",
cca => "P", cct => "P", ccg => "P", ccc => "P", ccn => "P",
caa => "Q", cag => "Q",
cga => "R", cgg => "R", cgc => "R", cgt => "R", cgn => "R",
aga => "R", agg => "R",
tca => "S", tcg => "S", tcc => "S", tct => "S", tcn => "S",
agc => "S", agt => "S",
aca => "T", acg => "T", acc => "T", act => "T", acn => "T",
gta => "V", gtg => "V", gtc => "V", gtt => "V", gtn => "V",
tgg => "W",
tat => "Y", tac => "Y",
tag => "", taa => "", tga => "");
open (STDOUT, ">", $opt_o) || die "Can't create output file!\n";
my $joinedAminoAcids = $reverseComplement = $sequenceTitle = "";
foreach $entry (keys %fastaFile)
{
my @arrayOfORFs = @arrayOfTranslations = @arrayOFBases = @reversedArray = ();
$directStrand = $fastaFile{$entry};
#Filter the DNA sequence.
$directStrand =~ s/>[^\n]+//;
$directStrand =~ tr/GATCN/gatcn/;
$directStrand =~ s/[^gatcn]//g;
#Determine the reverse-complement of the sequence. The reverse-complement
#will be used when looking for ORFs on the opposite strand.
$reverseComplement = $directStrand;
$reverseComplement =~ tr/gatcn/ctagn/;
@arrayOfBases = split(/\B/, $reverseComplement);
@reversedArray = reverse(@arrayOfBases);
$reverseComplement = join("",@reversedArray);
#ORF finding code
for (my $i = 0; $i < 2; $i = $i + 1) {
my $sequenceEntry = "";
my $strand = "";
if ($i == 0) {
#the first time the ORF finding code executes we will search
#for ORFs on the direct strand.
$sequenceEntry = $directStrand;
$strand = "+";
}
else {
#the second time the ORF finding code executes we will search
#for ORFs on the reverse-complement.
$sequenceEntry = $reverseComplement;
$strand = "-";
}
my @startsRF1 =();
my @startsRF2 =();
my @startsRF3 =();
my @stopsRF1 = ();
my @stopsRF2 = ();
my @stopsRF3 = ();
while ($sequenceEntry =~ m/\A(?!tag|taa|tga)/gi){
push (@startsRF1, 0);
}
while ($sequenceEntry =~ m/\A[AaCcGgTtNn](?!tag|taa|tga)/gi){
push (@startsRF2, 0);
}
while ($sequenceEntry =~ m/\A[AaCcGgTtNn]{2}?(?!tag|taa|tga)/gi){
push (@startsRF3, 0);
}
while ($sequenceEntry =~ m/atg/gi){
my $matchPosition = pos($sequenceEntry) - 3;
if (($matchPosition % 3) == 0) {
push (@startsRF1, $matchPosition);
}
elsif ((($matchPosition + 2) % 3) == 0) {
push (@startsRF2, $matchPosition);
}
else {
push (@startsRF3, $matchPosition);
}
}
while ($sequenceEntry =~ m/tag|taa|tga/gi){
my $matchPosition = pos($sequenceEntry);
if (($matchPosition % 3) == 0) {
push (@stopsRF1, $matchPosition);
}
elsif ((($matchPosition + 2) % 3) == 0) {
push (@stopsRF2, $matchPosition);
}
else {
push (@stopsRF3, $matchPosition);
}
}
my $codonRange = "";
my $startPosition = 0;
my $stopPosition = 0;
@startsRF1 = reverse(@startsRF1);
@stopsRF1 = reverse(@stopsRF1);
while (scalar(@startsRF1) > 0) {
$codonRange = "";
$startPosition = pop(@startsRF1);
if ($startPosition < $stopPosition) {
next;
}
while (scalar(@stopsRF1) > 0) {
$stopPosition = pop(@stopsRF1);
if ($stopPosition > $startPosition) {
last;
}
}
if ($stopPosition <= $startPosition) {
$stopPosition = length($sequenceEntry) - (length($sequenceEntry) % 3);
$codonRange = $strand . "1 " . $startPosition . ".." . $stopPosition;
push (@arrayOfORFs, $codonRange);
last;
}
else {
$codonRange = $strand . "1 " . $startPosition . ".." . $stopPosition;
push (@arrayOfORFs, $codonRange);
}
}
$stopPosition = 0;
@startsRF2 = reverse(@startsRF2);
@stopsRF2 = reverse(@stopsRF2);
while (scalar(@startsRF2) > 0) {
$codonRange = "";
$startPosition = pop(@startsRF2);
if ($startPosition < $stopPosition) {
next;
}
while (scalar(@stopsRF2) > 0) {
$stopPosition = pop(@stopsRF2);
if ($stopPosition > $startPosition) {
last;
}
}
if ($stopPosition <= $startPosition) {
$stopPosition = length($sequenceEntry) - ((length($sequenceEntry) + 2) % 3);
$codonRange = $strand . "2 " . $startPosition . ".." . $stopPosition;
push (@arrayOfORFs, $codonRange);
last;
}
else {
$codonRange = $strand . "2 " . $startPosition . ".." . $stopPosition;
push (@arrayOfORFs, $codonRange);
}
}
$stopPosition = 0;
@startsRF3 = reverse(@startsRF3);
@stopsRF3 = reverse(@stopsRF3);
while (scalar(@startsRF3) > 0) {
$codonRange = "";
$startPosition = pop(@startsRF3);
if ($startPosition < $stopPosition) {
next;
}
while (scalar(@stopsRF3) > 0) {
$stopPosition = pop(@stopsRF3);
if ($stopPosition > $startPosition) {
last;
}
}
if ($stopPosition <= $startPosition) {
$stopPosition = length($sequenceEntry) - ((length($sequenceEntry) + 1) % 3);
$codonRange = $strand . "3 " . $startPosition . ".." . $stopPosition;
push (@arrayOfORFs, $codonRange);
last;
}
else {
$codonRange = $strand . "3 " . $startPosition . ".." . $stopPosition;
push (@arrayOfORFs, $codonRange);
}
}
}
#Use a loop to examine each ORF description in @arrayOfORFs. Translate ORFs that are
#longer than $MINORF, and add those translations to @arrayOfTranslations
foreach(@arrayOfORFs) {
#Use the matching operator to copy parts of the ORF range into variables.
#The strand will be stored in $1, the reading frame in $2, the start of
#the ORF in $3, and the end of the ORF in $4.
$_ =~ m/([\+\-])(\d)\s(\d+)\.\.(\d+)/;
#Skip ORFs that are smaller than $MINORF
if (($4 - $3) < $MINORF) {
next;
}
#Declare a variable to hold the DNA sequence of the ORF.
my $ORFsequence = "";
#If the ORF is on the forward strand, use "substr" to obtain the sequence
#of the ORF from $directStrand. Otherwise, obtain the sequence from
#$reverseComplement.
if ($1 eq "+") {
$ORFsequence = substr($directStrand, $3, $4 - $3);
}
else {
$ORFsequence = substr($reverseComplement, $3, $4 - $3);
}
#Now use a for loop to translate the ORF sequence codon by codon.
#The amino acids are added as elements to the array @growingProtein.
my @growingProtein = ();
for (my $i = 0; $i <= (length($ORFsequence) - 3); $i = $i + 3) {
my $codon = substr($ORFsequence, $i, 3);
if (exists( $translationHash{$codon} )){
push (@growingProtein, $translationHash{$codon});
}
else {
push (@growingProtein, "X");
}
}
#Now convert the @growingProtein array to a string.
$joinedAminoAcids = join("",@growingProtein);
#Uncomment the following line to create a reversed database
#$joinedAminoAcids = reverse $joinedAminoAcids;
#it may help to add the string "Rev" to the two assignments following. . .
#Add a title to the protein sequence before adding it to @arrayOfTranslations.
#For ORFs on the reverse strand adjust the position numbers so that they refer
#to the direct strand.
if ($1 eq "+") {
$joinedAminoAcids = ">" . $entry . $1 . $2 . "_" . ($3 + 1) .
" " . ($3 + 1) . " to " . $4 . ", " . ($4 - $3) . " bases.\n" . $joinedAminoAcids;
}
else {
$joinedAminoAcids = ">" . $entry . $1 . $2 . "_" . (length($directStrand) - $4 + 1) .
" " . (length($directStrand) - $4 + 1) . " to " . (length($directStrand) - $3) .
", " . ($4 - $3) . " bases.\n" . $joinedAminoAcids;
}
#The text contained in $joinedAminoAcids can now be added on to our array
#of protein translations.
push (@arrayOfTranslations, $joinedAminoAcids);
}
foreach(@arrayOfTranslations) {
print "$_ \n";
}
}
close STDOUT;