#!/usr/bin/perl -w

#############################################################################
#### APPLY LOG TRANSFORMATION TO THE DATA
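## The transformation is defined on a training set (option -f) and applied to all other
## sets given as arguments. Unless -l is given, the training values are shifted so that
## the minimum becomes 1, log10-transformed (so the minimum maps to 0) and flattened at
## -s standard deviations around the mean. The result is then rescaled linearly between
## 0 and 1 (ideal input for a neural network).
## *** Other bounds can be given for the final rescaling (options -L and -H).
### Each output file is written next to its input, under the same name with an "rr_" prefix.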


use strict;
use Getopt::Std;			# Command line options
use Statistics::Descriptive;
use List::Util qw[min max];

########################
# 1. Get command line options. peptide file is mandatory
## Option variables are filled in by Getopt::Std, which requires package
## globals named $opt_<letter>.
our ($opt_f, $opt_l, $opt_h, $opt_L, $opt_H, $opt_r, $opt_s);

## getopts returns false when it meets an unknown option or a missing option
## value; show the help instead of silently running with a partial option set.
getopts('f:lhL:H:rs:') or usage();
usage() unless defined($opt_f);
usage() if $opt_h;

my $lw = defined($opt_L) ? $opt_L : 0;  ## lower bound for the rescaled data
my $hg = defined($opt_H) ? $opt_H : 1;  ## higher bound for the rescaled data

## number of standard deviations at which outliers are flattened
my $sflat = defined($opt_s) ? $opt_s : 3;

## all other files to rescale are given as list of arguments
my @files = @ARGV;

my %score;      ## peptide -> averaged (later transformed) score
my %scores;     ## peptide -> list of all signals read for that peptide
my @rawscores;  ## averaged scores of the training set

###########
### 2. Read training data for the transformation
##
## Each line is expected to hold a peptide and a signal separated by
## whitespace; lines that do not match are skipped. $tsq counts every line
## read (matching or not).
my $tsq = 0;
open(my $train_fh, '<', $opt_f) or die "Cant open file $opt_f: $!";
while (defined(my $line = <$train_fh>))
{
    chomp $line;
    if ($line =~ m/^(\S+)\s+(\S+)/)
    {
        my ($pp, $signal) = ($1, $2);
        push(@{ $scores{$pp} }, $signal);
    }
    $tsq++;
}
close($train_fh);

###########
### 3. Transform the data and write the output files
##
## Every output file is written to the directory of its input file, under
## the same name with an "rr_" prefix.

## Build the output path ("<dir>/rr_<name>") for an input path.
my $rr_path = sub {
    my ($path) = @_;
    my ($dir, $fn) = dir_and_fname($path);
    return $dir . "/rr_" . $fn;
};

if ($tsq <= 1)
{
    ## The training file holds at most one line, so no transformation can be
    ## derived from it: every file is copied unchanged to its rr_ counterpart.
    ## The list form of system() bypasses the shell, so file names with
    ## spaces or shell metacharacters are passed to cp verbatim; "--" keeps
    ## names starting with "-" from being read as options.
    foreach my $src ($opt_f, @files)
    {
        my $dst = $rr_path->($src);
        system('cp', '--', $src, $dst) == 0
            or die "Cannot copy $src to $dst (cp status $?)\n";
    }
}
else
{
    ### average of identical sequences
    foreach my $pep (keys %scores)
    {
        my @signals = @{ $scores{$pep} };
        my $total = 0;
        $total += $_ for @signals;
        $score{$pep} = $total / scalar(@signals);
        push(@rawscores, $score{$pep});
    }

    ## Without any parsed record there is no minimum/maximum to work with.
    die "No peptide/signal records found in $opt_f\n" unless @rawscores;

    ### minimum and maximum values in training set
    my $mn = min(@rawscores);
    my $mx = max(@rawscores);

    ## raw minimum of the training set; reused to shift the other sets
    my $mnlog = $mn;

    ### apply logarithmic transform (skipped with -l)
    unless ($opt_l)
    {
        ## Shift so that the minimum becomes 1 -> log10(1) = 0.
        foreach my $pep (keys %score)
        {
            $score{$pep} = log($score{$pep} - $mn + 1) / log(10);
        }

        my $stat = Statistics::Descriptive::Full->new();
        $stat->add_data(values %score);
        my $mean = $stat->mean();
        my $sd   = $stat->standard_deviation();

        #### flatten outliers at $sflat standard deviations around the mean
        my $floor   = $mean - $sflat * $sd;
        my $ceiling = $mean + $sflat * $sd;
        foreach my $pep (keys %score)
        {
            if ($score{$pep} < $floor)
            {
                $score{$pep} = $floor;
            }
            elsif ($score{$pep} > $ceiling)
            {
                $score{$pep} = $ceiling;
            }
        }

        ## redefine minimum and maximum on the transformed, flattened data
        $mn = min(values %score);
        $mx = max(values %score);
    }

    ## A zero range cannot be mapped onto [$lw:$hg]; stop with a clear message
    ## instead of an "Illegal division by zero" further down.
    my $span = $mx - $mn;
    die "Cannot rescale: all training values in $opt_f are identical\n"
        if $span == 0;

    ## Map a (possibly log-transformed) value from [$mn:$mx] to [$lw:$hg],
    ## optionally reverse it (-r), and clamp it to [$lw+0.001:$hg].
    ## The reversal mirrors the value inside [$lw:$hg]; for the default bounds
    ## 0 and 1 this is exactly 1-value.
    my $rescale = sub {
        my ($value) = @_;
        my $rs = $lw + ($value - $mn) * ($hg - $lw) / $span;
        $rs = $lw + $hg - $rs if $opt_r;
        $rs = $lw + 0.001 if $rs < $lw + 0.001;
        $rs = $hg         if $rs > $hg;
        return $rs;
    };

    ### rescaled training data (one line per distinct peptide, sorted by
    ### peptide so that the output is reproducible between runs)
    my $train_out = $rr_path->($opt_f);
    open(my $tr, '>', $train_out) or die "Cannot create file $train_out: $!";
    foreach my $pep (sort keys %score)
    {
        my $val = $rescale->($score{$pep});
        print {$tr} "$pep\t$val\n";
    }
    close($tr) or die "Cannot write file $train_out: $!";

    #######################################################################
    ### apply the same transformation to all the other files (using the
    ### parameters found for the training set)
    foreach my $f (@files)
    {
        open(my $in, '<', $f) or die "Cant open file $f: $!";

        my $out_path = $rr_path->($f);
        open(my $out, '>', $out_path) or die "Cannot create file $out_path: $!";

        while (defined(my $line = <$in>))
        {
            chomp $line;
            next unless $line =~ m/^(\S+)\s+(\S+)/;
            my ($pp, $signal) = ($1, $2);

            unless ($opt_l)  # log transform (uses the training set minimum)
            {
                $signal = $signal - $mnlog + 1;
                $signal = 1 if $signal < 1;         ## min to 1 -> log(1)=0
                $signal = log($signal) / log(10);   ## log transform
            }

            ## Values outside the training range end up outside [$lw:$hg]
            ## and are clamped by $rescale.
            my $rs = $rescale->($signal);
            print {$out} "$pp\t$rs\n";
        }
        close($in);
        close($out) or die "Cannot write file $out_path: $!";
    }
}

#################################################
### SUBROUTINES

## Print the command line help to STDOUT and terminate the script.
## The synopsis lists every option accepted by getopts (including -r and -s)
## and shows -f without brackets because it is mandatory.
sub usage{
  print <<"END_USAGE";
$0
.

Usage: $0 -f train_file [-l] [-r] [-s val] [-H val] [-L val] [-h] [file] [file] ...
Command line options:
  f: peptide TAB signal intensities
  l: just linearly rescale between 0 and 1
  r: reverse the values from [0..1] to [1..0] after rescaling
  L: lower bound after rescaling (def 0)
  H: higher bound after rescaling (def 1)
  s: flatten outliers at s standard deviations (def 3)
  h: print this message
END_USAGE
  exit;
}

## Split a path into its directory part and its file name.
##
## Argument: a file path.
## Returns:  the list ($dir, $fn), where $dir carries no trailing slash and
##           "$dir/$fn" addresses the same file as the input path:
##             "a/b/c.txt" -> ("a/b", "c.txt")
##             "c.txt"     -> (".",   "c.txt")
##             "/c.txt"    -> ("",    "c.txt")   (file in the root directory)
## Dies with "Invalid path" when the path is empty.
sub dir_and_fname {

   my $path = $_[0];

   ## (.*) instead of (.+): the directory part may be empty, which is the case
   ## for a file directly under "/". Such a path would otherwise fall through
   ## to the bare-filename branch and come back as (".", "/c.txt").
   if ($path =~ m{(.*)/(.+)$}) {
       return ($1, $2);
   }

   ## No usable slash: a bare file name in the current directory.
   if ($path =~ m/(.+)/) {
       return (".", $1);
   }

   die ("Invalid path\n");
}
