#!/usr/bin/perl/
  
  # Authors: RAHUL SHARMA (with contributions from FRANCESCO DAL GRANDE)
  # Company: SBiK-F, Senckenberg.
  # Code: To extract the CDS sequence for SNP from the co-ordinates files.
 
  ###############################
  ## Libraries Used
  ###############################
  
 use warnings;
 use strict;
 use Getopt::Long;  # To get the multiple options;
                    # $0 to match the name of the program
 ###############################
 ## Usgae message
 ###############################
 my $usage="
perl $0 -cds <reference_cds> -snp_co <File_containing_Snp_co-ordinates> -ids <file_containing_gene_ids_with_snps> -run <Species_wise>

  Authors: RAHUL SHARMA (with contributions from FRANCESCO DAL GRANDE)
  Company: Bik-F, Senckenberg.
  Code: To extract the CDS sequence for SNP from the co-ordinates files.
 
 ";
 
 ###############################
 ## Options definition
 ###############################
 
 my ($cds_seq, $snp_file, $ids, $run, $help);
 
 GetOptions ("cds=s" => \$cds_seq,         #'s' means it would take a string value as input;   
             "snp_co=s" => \$snp_file,
             "ids=s" => \$ids,
             "run=s" => \$run,
             "h|help!"  => \$help);           #h! will not take any value;
 
   if ($help) {
          die $usage;
   }

 die "\nAll parameters are required",$usage unless( $cds_seq && $snp_file && $ids && $run);
 

###############################################
## Program variable/subroutines declarations
###############################################

my $seq_cds=fastaparser($cds_seq);
seq_generator($seq_cds, $snp_file, $ids);

sub seq_generator{
my $cds_seq_ref=$_[0]; my $snp_file=$_[1];
my %ref_seq=%{$cds_seq_ref}; my $ids_file=$_[2];

my (%I_ref, %I_pool1, %I_pool2, %I_pool3, %I_pool4, %I_pool5, %I_pool6);

for(my $d=3; $d <= 8;$d++){

my %ref_seq=%{$cds_seq_ref};

open S,"$snp_file" or die$!;
chomp (my $l=<S>);

my @sp=split(/\t/, $l);

do{
chomp ($l=<S>);
my @s=split(/\t/, $l);
if($ref_seq{$s[0]}){
my $seq=$ref_seq{$s[0]};
my $pos=$s[1]-1;
my $snp=$s[$d];
$seq =~ s/^(.{$pos})(.)(.*)$/$1$snp$3/;

$ref_seq{$s[0]}=$seq;
 }
else { print STDERR "Id: $s[0] not found in the fasta sequence file!!!\n"; }
}until eof(S);

if($d==2){%I_ref = %ref_seq;} 
if($d==3){%I_pool1 = %ref_seq;} 
elsif($d==4){%I_pool2 = %ref_seq;} 
elsif($d==5){%I_pool3 = %ref_seq;}
elsif($d==6){%I_pool4 = %ref_seq;}
elsif($d==7){%I_pool5 = %ref_seq;}
elsif($d==8){%I_pool6 = %ref_seq;}


if($run=~ /^Species_wise$/i){
open ID, "<$ids_file" or die$!;
open SEQ, ">$sp[$d]\.fa" or die$!;
do{

chomp (my $k=<ID>);

if($ref_seq{$k}) {print SEQ ">$k\_$sp[$d]\n$ref_seq{$k}\n";}

else{print STDERR "Id: $k not in the snp ids\n";}

  }until eof(ID);}
}
   if($run =~ /^Id_wise$/i)
  {
open ID, "<$ids_file" or die$!;
do{
chomp (my $k_id=<ID>);
open SR,">$k_id.fa" or die$!;
if($ref_seq{$k_id}) {
print SR ">$k_id\_I_Pool1\n$I_pool1{$k_id}
>$k_id\_I_pool2\n$I_pool2{$k_id}
>$k_id\_I_pool3\n$I_pool3{$k_id}
>$k_id\_I_pool4\n$I_pool4{$k_id}
>$k_id\_I_pool5\n$I_pool5{$k_id}
>$k_id\_I_pool6\n$I_pool6{$k_id}\n"
     }
close SR;   }until eof(ID)
 }
   if($run =~ /^Missed_snp$/i)
  {
open ID, "<$ids_file" or die$!;
do{
chomp (my $k_id=<ID>);
#print "\n##### Gene id: $k_id #####\n";
print "\n";
if($ref_seq{$k_id}) {
if($I_ref{$k_id}=~/\?/){ print "I_ref\t";}
if($I_pool1{$k_id}=~/\?/) {print"I_pool1\t";}
if($I_pool2{$k_id}=~/\?/) {print"I_pool2\t";}
if($I_pool3{$k_id}=~/\?/) {print"I_pool3\t";}
if($I_pool4{$k_id}=~/\?/) {print"I_pool4\t";}     
if($I_pool5{$k_id}=~/\?/) {print"I_pool5\t";}
if($I_pool6{$k_id}=~/\?/) {print"I_pool6\t";}
	}
close SR;   }until eof(ID);
print "\n"; 
}
}

sub fastaparser {
    my $filename = $_[0];  # contigs.fasta file

    my %fasta;
    my $count;
    open (FILE, $filename) or die$!; # open file

    my $line = <FILE>;
    
    do {
        if ($line =~ /^>/)  #grep Id
        {
            my $seq; chomp $line;
            my @ln=split(/>/, $line);  #split on the basis of "_"
            my $id = $ln[1];          #take 4th part i.e Lenght of the sequence as Id.
            chomp $id;

            do {
                $line = <FILE>;
                chomp $line;
                $seq .= $line unless $line =~ /^>/;  #concatenate the sequence till the next ">" character.
                }
            until ($line =~ /^>/ or eof(FILE));

            $fasta{$id} = $seq;
         }
        else {
            $line = <FILE>;
              }
        } until (eof(FILE));    

    return \%fasta;    #Return the entire hash reference.
  }

#Example of 'File_containing_Snp_co-ordinates':
#Gene_id Base_position   ref     pool1   pool2   pool3   pool4   pool5   pool6
#FUN_00578       30      G       A       G       A       G       A       G