#!/usr/bin/perl -w #!perl #!/usr/bin/perl ############################################################################# # Copyright 2007 # # Purpose: Compile a non-redundant list of proteins identified from a # # list of peptides. Will first remove unwanted peptides matched # # in a separate library (i.e. remove keratin peptides). Will # # Will also keep subsets proteins which may have been identified # # but for which no specific information exists # ############################################################################# ############################################################################# # UBC Centre for Proteomics # # Organelle Proteomics Group # # Leonard Foster # # E-mail: ljfoster@interchange.ubc.ca # # WWW: http://www.proteomics.ubc.ca/foster/ # # # # # # FILENAME: finalList .pl # # # # CREATED: LJF 01-21-2007 # # UPDATED: LJF xx-xx-2007 # # # # # ############################################################################# # Usage: perl finalList.pl -f -u -p -o # Not necessary to use -u use Getopt::Std; my %options; getopt ('fupo', \%options); $sequence = ""; %protHits = %pepsToProts = %protsToPeps = (); #read in all the input files #First the full fasta library to search against open (STDIN, $options{"f"}) || die "Can't open Fasta file!\n"; while (<>) { if ($_ =~ m/(>.+)/) { $sequence .= "\n" . $1 . "\t"; } elsif ($_ =~ m/([A-Za-z]+)/) { $sequence .= $1; } } close STDIN; #then the unwanted proteins, if option is used if ($options{"u"}) { open (STDIN, $options{"u"}) || die "Can't open file with unwanted sequences!\n"; while (<>) { if ($_ =~ m/(>.+)/) { $unwanted .= "\t"; } elsif ($_ =~ m/^([A-Za-z]+)/) { $unwanted .= $1; } } close STDIN; } #then the peptides open (STDIN, $options{"p"}) || die "Can't open peptide file!\n"; while (<>) { if ($_ =~ m/([A-Za-z]+)/) { $pepsToProts{$1} = ""; } } close STDIN; #Now process the peptide list #filter out unwanted peptides if option 'u' was used if ($options{"u"}) { foreach $peptide (keys %pepsToProts) { if ($unwanted =~ m/$peptide/) { delete $pepsToProts{$peptide}; } } } #find fasta entries that each peptide occurs in #map proteins to peptides and vice versa foreach $peptide (keys %pepsToProts) { %protHits = $sequence =~ m/(>.+)\t.+($peptide)/; foreach $protein (keys %protHits) { if ($pepsToProts{$peptide} eq "") { $pepsToProts{$peptide} = $protein; } else { $pepsToProts{$peptide} .= "\t$protein"; } if ($protsToPeps{$protein}) { $protsToPeps{$protein} .= "\t$peptide"; } else { $protsToPeps{$protein} = $peptide; } } } #now distill the lists down to a non-redundant collection foreach $hash (sort keys %pepsToProts) { if ($pepsToProts{$hash} =~ m/\t/) #this will miss peptides found in only one protein { @prots = split(/\t/, $pepsToProts{$hash}); foreach $protein (@prots) { if (exists($protsToPeps{$protein})) { @basePeps = split(/\t/, $protsToPeps{$protein}); #this also misses proteins with one peptide foreach $testProt (@prots) { if (($testProt ne $protein) && ($protsToPeps{$testProt})) { #%testPeps = (); @testPeps = split(/\t/, $protsToPeps{$testProt}); #foreach $dilly (@testPeps) #{ # $testPeps{$dilly} = 1; #} @intersect = (); foreach $dally (@basePeps) { if (exists($testPeps[$dally])) { @intersect = (@intersect,$dally); } } if (@intersect == @basePeps) { if (@basePeps >= @testPeps) { delete $protsToPeps{$testProt}; } else { delete $protsToPeps{$protein}; } } } } } } } } #Count number of times each peptide is used in new hash $pepCount = $numPeps = (); foreach $protein (keys %protsToPeps) { @basePeps = split(/\t/, $protsToPeps{$protein}); #at the same time count the number of peptides for each protein if (@basePeps < 10) { $numPeps{$protein} = "00" . @basePeps; } elsif (@basePeps < 100) { $numPeps{$protein} = "0" . @basePeps; } else { $numPeps{$protein} = @basePeps; } foreach $pep (@basePeps) { if (exists($pepCount{$pep})) { $integ = $pepCount{$pep} + 1; $pepCount{$pep} = $integ; } else { $pepCount{$pep} = 1; } } } #Then go through each protein again and check that each has at least one unique peptide #Start from the protein with the fewest peptides foreach $protein (sort {$numPeps{$a} cmp $numPeps{$b}} keys %numPeps) { @basePeps = split(/\t/, $protsToPeps{$protein}); $multiUse = 1; foreach $pep (@basePeps) { if ($pepCount{$pep} == 1) { $multiUse = 0 } } if ($multiUse == 1) { foreach $pep (@basePeps) { $integ = $pepCount{$pep} - 1; $pepCount{$pep} = $integ; } delete $protsToPeps{$protein}; } } #output all data $acc = $name = ""; open (STDOUT, ">", $options{"o"}) || die "Can't create output file!\n"; foreach $hash (sort keys %protsToPeps) { if ($hash =~ m/>(.+?)\|/) { $acc = $1; } if ($hash =~ m/Tax_Id=\d+\s(.+)/) { $name = $1; } print "$acc\t$name\t$numPeps{$hash}\t$protsToPeps{$hash}\n"; } close STDOUT;