#!/usr/bin/perl/ # Authors: RAHUL SHARMA (with contributions from FRANCESCO DAL GRANDE) # Company: SBiK-F, Senckenberg. # Code: To extract the CDS sequence for SNP from the co-ordinates files. ############################### ## Libraries Used ############################### use warnings; use strict; use Getopt::Long; # To get the multiple options; # $0 to match the name of the program ############################### ## Usgae message ############################### my $usage=" perl $0 -cds -snp_co -ids -run Authors: RAHUL SHARMA (with contributions from FRANCESCO DAL GRANDE) Company: Bik-F, Senckenberg. Code: To extract the CDS sequence for SNP from the co-ordinates files. "; ############################### ## Options definition ############################### my ($cds_seq, $snp_file, $ids, $run, $help); GetOptions ("cds=s" => \$cds_seq, #'s' means it would take a string value as input; "snp_co=s" => \$snp_file, "ids=s" => \$ids, "run=s" => \$run, "h|help!" => \$help); #h! will not take any value; if ($help) { die $usage; } die "\nAll parameters are required",$usage unless( $cds_seq && $snp_file && $ids && $run); ############################################### ## Program variable/subroutines declarations ############################################### my $seq_cds=fastaparser($cds_seq); seq_generator($seq_cds, $snp_file, $ids); sub seq_generator{ my $cds_seq_ref=$_[0]; my $snp_file=$_[1]; my %ref_seq=%{$cds_seq_ref}; my $ids_file=$_[2]; my (%I_ref, %I_pool1, %I_pool2, %I_pool3, %I_pool4, %I_pool5, %I_pool6); for(my $d=3; $d <= 8;$d++){ my %ref_seq=%{$cds_seq_ref}; open S,"$snp_file" or die$!; chomp (my $l=); my @sp=split(/\t/, $l); do{ chomp ($l=); my @s=split(/\t/, $l); if($ref_seq{$s[0]}){ my $seq=$ref_seq{$s[0]}; my $pos=$s[1]-1; my $snp=$s[$d]; $seq =~ s/^(.{$pos})(.)(.*)$/$1$snp$3/; $ref_seq{$s[0]}=$seq; } else { print STDERR "Id: $s[0] not found in the fasta sequence file!!!\n"; } }until eof(S); if($d==2){%I_ref = %ref_seq;} if($d==3){%I_pool1 = %ref_seq;} elsif($d==4){%I_pool2 = %ref_seq;} elsif($d==5){%I_pool3 = %ref_seq;} elsif($d==6){%I_pool4 = %ref_seq;} elsif($d==7){%I_pool5 = %ref_seq;} elsif($d==8){%I_pool6 = %ref_seq;} if($run=~ /^Species_wise$/i){ open ID, "<$ids_file" or die$!; open SEQ, ">$sp[$d]\.fa" or die$!; do{ chomp (my $k=); if($ref_seq{$k}) {print SEQ ">$k\_$sp[$d]\n$ref_seq{$k}\n";} else{print STDERR "Id: $k not in the snp ids\n";} }until eof(ID);} } if($run =~ /^Id_wise$/i) { open ID, "<$ids_file" or die$!; do{ chomp (my $k_id=); open SR,">$k_id.fa" or die$!; if($ref_seq{$k_id}) { print SR ">$k_id\_I_Pool1\n$I_pool1{$k_id} >$k_id\_I_pool2\n$I_pool2{$k_id} >$k_id\_I_pool3\n$I_pool3{$k_id} >$k_id\_I_pool4\n$I_pool4{$k_id} >$k_id\_I_pool5\n$I_pool5{$k_id} >$k_id\_I_pool6\n$I_pool6{$k_id}\n" } close SR; }until eof(ID) } if($run =~ /^Missed_snp$/i) { open ID, "<$ids_file" or die$!; do{ chomp (my $k_id=); #print "\n##### Gene id: $k_id #####\n"; print "\n"; if($ref_seq{$k_id}) { if($I_ref{$k_id}=~/\?/){ print "I_ref\t";} if($I_pool1{$k_id}=~/\?/) {print"I_pool1\t";} if($I_pool2{$k_id}=~/\?/) {print"I_pool2\t";} if($I_pool3{$k_id}=~/\?/) {print"I_pool3\t";} if($I_pool4{$k_id}=~/\?/) {print"I_pool4\t";} if($I_pool5{$k_id}=~/\?/) {print"I_pool5\t";} if($I_pool6{$k_id}=~/\?/) {print"I_pool6\t";} } close SR; }until eof(ID); print "\n"; } } sub fastaparser { my $filename = $_[0]; # contigs.fasta file my %fasta; my $count; open (FILE, $filename) or die$!; # open file my $line = ; do { if ($line =~ /^>/) #grep Id { my $seq; chomp $line; my @ln=split(/>/, $line); #split on the basis of "_" my $id = $ln[1]; #take 4th part i.e Lenght of the sequence as Id. chomp $id; do { $line = ; chomp $line; $seq .= $line unless $line =~ /^>/; #concatenate the sequence till the next ">" character. } until ($line =~ /^>/ or eof(FILE)); $fasta{$id} = $seq; } else { $line = ; } } until (eof(FILE)); return \%fasta; #Return the entire hash reference. } #Example of 'File_containing_Snp_co-ordinates': #Gene_id Base_position ref pool1 pool2 pool3 pool4 pool5 pool6 #FUN_00578 30 G A G A G A G