#!/usr/bin/perl -w
### Stand-alone version.
use strict;

##### SET CUSTOM PATHS HERE ############################

my $R = "";
my $weblogo = "";

########################################################

#use FindBin qw($bin);
#use lib "$bin/lib";
use Getopt::Std;			# Command line options
use List::Util qw[min max];
use FileHandle;
use File::Spec;
my $bin;
BEGIN {
	# Locate the directory that holds this script, so that the bundled
	# libraries, binaries and data files are found from any working directory.
	$bin = File::Spec->rel2abs($0); #absolute path of caller
	if (-l $bin) {  #called from symlink?
		# readlink() returns the link target verbatim. A relative target is
		# relative to the directory containing the link, not to the cwd, so
		# resolve it against the link's own directory.
		my $target = readlink($bin);
		if (defined($target)) {
			my ($vol, $linkdir) = File::Spec->splitpath($bin);
			$bin = File::Spec->rel2abs($target, File::Spec->catpath($vol, $linkdir, ''));
		}
	}
	$bin =~ s/(.*)(\/\S+)/$1/;  #drop the script name, keep its directory
}
use lib "$bin/lib";   #add library to path
use Parallel::ChildManager;
use Statistics::Descriptive;

########################
# 1. Get command line options. Input file is mandatory (in format Seq TAB Value)
use vars qw($opt_a $opt_b $opt_c $opt_d $opt_e $opt_f $opt_h $opt_j $opt_l $opt_m $opt_n $opt_p $opt_r $opt_s $opt_t $opt_u $opt_x $opt_y $opt_A $opt_B $opt_C $opt_E $opt_F $opt_H $opt_I $opt_L $opt_M $opt_O $opt_P $opt_Q $opt_R $opt_S $opt_T $opt_U $opt_X $opt_Y $opt_W $opt_Z);
getopts('a:b:c:d:e:f:hj:l:m:n:pr:s:t:u:x:yAB:C:EFH:I:LM:OP:R:QST:U:XY:W:Z:');

# An explicit request for help (-h) is served first, so that it is never
# reported as a "missing input" error. The exit is a no-op if usage() already
# terminates the program.
if ($opt_h)
{
    usage();
    exit;
}

# Either a training file (-f) or a previously trained model (-M) is mandatory.
unless (defined($opt_f) || defined($opt_M))
{
    usage();
    
    print "ERROR. No training data OR trained method uploaded\n";
	exit;
}

### Platform type
# Kernel name and machine type as reported by uname; selects the sub-directory
# holding the binaries for this machine.
my $platform = qx(uname -sm);
chomp $platform;

# Look the platform up in the table of supported ones; the result stays
# undefined for anything that is not listed.
my $subbin = do {
    my %subbin_of = (
        "Linux x86_64"  => "Linux_x86_64",
        "Linux ia64"    => "Linux_ia64",
        "Darwin i386"   => "Darwin_i386",
        "Darwin x86_64" => "Darwin_x86_64",
    );
    $subbin_of{$platform};
};

unless (defined($subbin)) {
    print "Platform: $platform not supported.";
    exit(0);
}

my $pwd = qx(pwd);  ##current working directory
chomp $pwd;

# Results go to the current directory, unless -r names another one.
my $resdir = defined($opt_r) ? "$opt_r" : $pwd;

# Session ID: start from the process ID and increase it until no entry with
# that name exists in the results directory.
my $sID=$$;
while (-e "$resdir/$sID") {
    $sID++;
}

# File-name prefix: the session ID, or a custom one given with -P.
my $prefix = defined($opt_P) ? $opt_P : $sID;

$resdir .= "/$sID";

## number of parallel processes (-c)
my $procs = defined($opt_c) ? $opt_c : 10;
isInt($procs) or die "Option error: -c $procs must be a positive integer\n";

## -Y: 0 or 1; when 1 the rescaling step is told that low values are the positive examples
my $reverse_data = defined($opt_Y) ? $opt_Y : 0;
if ($reverse_data ne "0" && $reverse_data ne "1") {
    die "Option error: -Y $reverse_data must be 0 or 1\n";
}

## folds for cross-validation (-a)
my $folds = defined($opt_a) ? $opt_a : 5;
isInt($folds) or die "Option error: -a $folds must be a positive integer\n";

## number of seeds for NN training (-s)
my $nseeds = defined($opt_s) ? $opt_s : 10;
isInt($nseeds) or die "Option error: -s $nseeds must be a positive integer\n";

## Number of hidden neurons in the ANNs (-n): a single integer or a
## comma-separated list. $nh keeps the raw option value, @nh the parsed list
## (one network architecture is trained per entry).
my $nh=3;
$nh=$opt_n if defined($opt_n);
my @nh;

if (defined($opt_n) && $opt_n=~ m/(\S+)/) {
   @nh = split(/,/,$1);
   # A value made only of commas splits into an empty list; without this check
   # no network would be trained and the run would fail much later.
   die "Option error: -n $opt_n must be a positive integer, or a comma separated list of integers\n" unless @nh;
} else {
    $nh[0]=3;
}

for my $hidden (@nh) {
     die "Option error: -n $opt_n must be a positive integer, or a comma separated list of integers\n" unless isInt($hidden);
}

## Sequence encoding (-B): 0 = Sparse, 1 = Blosum, 2 = both combined
my $Bo = defined($opt_B) ? $opt_B : 1;
unless (grep { $Bo eq $_ } ("0", "1", "2")) {
    die "Option error: -B $Bo must be either 0 (Sparse), 1 (Blosum) or 2 (combined)\n";
}

## list of encodings to train with (0 = sparse, 1 = blosum)
my @bl_enc = $Bo==2 ? (0,1)
           : $Bo==1 ? (1)
           :          (0);

## number of training cycles (-C)
my $cycles = defined($opt_C) ? $opt_C : 500;
isInt($cycles) or die "Option error: -C $cycles must be a positive integer\n";

## do extensive cross-validation? (-e)
my $cval = defined($opt_e) ? $opt_e : 0;
if ($cval ne "0" && $cval ne "1") {
    die "Option error: -e $cval must be either 0 or 1\n";
}

## rescale the data (linear, log, or none) (-l)
my $dolog = defined($opt_l) ? $opt_l : 0;
unless (grep { $dolog eq $_ } ("0", "1", "2")) {
    die "Option error: -l $dolog must be either 0,1 or 2\n";
}

## stop on best test set performance (-y); this needs at least 3 sets
## (1 train + 1 stop + 1 eval), so it is switched off for fewer folds
my $dostop = (defined($opt_y) && !($folds<3)) ? 1 : 0;

## with no stop set there is no need for an extra test set
$cval=0 unless $dostop;

### MOTIF LENGTH
my $maxl=9; #maximum expected motif length
my $minl=9; #minimum expected motif length
my $stepl=1; #step in motif lengths

# -m accepts "[length]", "[min]-[max]" or "[min]-[max]/[step]".
# The most specific pattern is tried first.
if (defined($opt_m))
{
   if ($opt_m =~ /(\d+)-(\d+)\/(\d+)/)  #range in format: [min]-[max]/[step]
   {
	($minl,$maxl,$stepl)=($1,$2,$3);
   }
   elsif ($opt_m =~ /(\d+)-(\d+)/)  #range with step=1
   {
	($minl,$maxl)=($1,$2);
   }
   elsif ($opt_m =~ /(\d+)/) #single value: try only this motif length
   {
	$maxl=$1;
	$minl=$1;
   }
   else
   {
	  die "Error in motif length range. -m [length] or -m [min]-[max]/[step]\n";
   }

   # accept the two ends of the range in either order
   ($minl,$maxl)=($maxl,$minl) if $maxl<$minl;

   # every loop over motif lengths advances by $stepl: a step of 0 would never terminate
   die "Error in motif length range. The step in -m [min]-[max]/[step] must be at least 1\n" if $stepl<1;
}

## best motif length found so far, and its RMSE
my $bestl=$minl;
my $bestRMSE=9999;

## flanks? (-j)
my $fl = defined($opt_j) ? $opt_j : 0;

## encode peptide length (-L)
my $encL = defined($opt_L) ? 1 : 0;

## encode PFR length (-F)
my $encF = defined($opt_F) ? 1 : 0;

## do random split (0), homology clustering (1), or common motif (2) (-H)
my $do_CM=0;
if (defined($opt_H)) {
    if    ($opt_H eq 2) { $do_CM=2; }
    elsif ($opt_H eq 1) { $do_CM=1; }
}

## threshold for common motif (-I)
my $CMthr = defined($opt_I) ? $opt_I : 5;
isInt($CMthr) or die "Option error: -I must be an integer number\n";

## similarity threshold for Hobohm1 reduction (-t)
my $thrH = defined($opt_t) ? $opt_t : 0.8;
isAnumber($thrH) or die "Option error: -t $thrH must be a number\n";

## temperature start for PSSM align (-u)
my $ts_pssm = defined($opt_u) ? $opt_u : 0.10;

## number of iterations for PSSM align (-Z)
my $i_pssm = defined($opt_Z) ? $opt_Z : 2000;

## Bits in the logo (-U); values outside 1..4 are ignored and the default kept
my $logoBits=4;
if (defined($opt_U) && isInt($opt_U) && $opt_U>=1 && $opt_U<=4) {
    $logoBits=$opt_U;
}

## threshold on Evaluation set predictions (-T)
my $Xthr = defined($opt_T) ? $opt_T : 0;
isAnumber($Xthr) or die "Option error: -T $Xthr must be a number\n";

#### top $bs networks for final ensemble (-b)
my $bs = defined($opt_b) ? $opt_b : 20;

#### Pos 1 hydrophobic preference (-p)
my $p1a = defined($opt_p) ? 1 : 0;

##################
## child manager
## Child manager that runs the external jobs, created for $procs processes
my $cm=ChildManager->new($procs);

#### binary and data directory
my $mdir="$bin/bin";
$mdir = $opt_d if defined($opt_d);
my $matdir="$bin/data";

##dedicated working directory 
my $wd="$resdir.tmp";

my $c="mkdir -p $wd";
system($c)==0 or die "Failed to execute $c: $?\n";  #failed to create working directory
$c="mkdir -p $resdir";
system($c)==0 or die "Failed to execute $c: $?\n";  #failed to create results directory

## Sub-directories for the training logs (nnres) and synapse files (synfiles).
## -p tolerates directories left over from a previous run, and the status is
## checked because every training job writes into them.
$c = "mkdir -p $wd/nnres $wd/synfiles";
system($c)==0 or die "Failed to execute $c: $?\n";

## PDF with the data distribution plots
my $outf="$wd/$prefix" . ".data.distrib.pdf";

## R and perl
## R: the path comes from -R or from the setting at the top of the script;
## an empty path disables all plots
if (defined($opt_R)) {
	$R = $opt_R;
}
my $rscripts="$mdir/Rscripts";
my $use_R = length($R)>0 ? 1 : 0; #path to R defined?  

## Perl interpreter, located through the shell
my $perl = qx(which perl);
chomp $perl;
die "It appears that you don't have Perl installed. Died."
	if (length($perl)<1 || $perl =~ m/not found/);

##awk or gawk: gawk is preferred, awk is the fallback
my $awk = qx(which gawk);
chomp $awk;
unless (length($awk)>=1 && $awk !~ m/not found/) {
	$awk = qx(which awk);
	chomp $awk;
}

## Weblogo: the path comes from -W or from the setting at the top of the
## script; an empty path disables sequence logos
if (defined($opt_W)) {
	$weblogo = $opt_W;
}
my $use_weblogo = length($weblogo)>0 ? 1 : 0; #weblogo is installed?

my $cmd;
if ($use_weblogo) {
## Ghost script and convert, for Weblogo: only warn when they are missing
	for my $tool ('gs', 'convert') {
		my $path = qx(which $tool);
		chomp $path;
		if (length($path)<1 || $path =~ m/not found/) {
			print "## Warning: it appears that you don't have '$tool' installed. Logos may not be displayed\n";
		}
	}
}

## NN align code + scripts
## Programs and scripts that live in the platform-specific binary directory
## ($mdir/$subbin); variables and file names are listed in the same order.
my ($smm_train, $smm_play, $hobohm1, $logtransf, $common_motif, $pssm_align,
    $makefsa, $args, $splitfile, $pep2mat, $xycorr, $xyerror)
    = map { "$mdir/$subbin/$_" } qw(
        smm_align_flank_gd_nn_pssm
        smm_align_flank_gd_nn_play_pssm
        hobohm1_id
        logtransf.pl
        common_motif_long
        gibbss_mc.pssmalign.RL
        fsafile.pl
        args
        splitfile
        pep2mat
        xycorr
        xyerror
    );

## seqlogo is taken from the Weblogo installation
my $seqlogo="$weblogo/seqlogo";

## amino acid alphabet and Blosum frequency matrix
my $AA="ACDEFGHIKLMNPQRSTVWY";
my $blf="$matdir/blosum62.freq_rownorm";

### display all options

my $buff = "#Read command line. Process ID: $sID\n";
$buff .= "#Architecture: $platform\n";
$buff .= "#Call:\n$0 ";

# Echo every option that was given, in a fixed order. Each entry is
# [switch, value, takes-an-argument]; plain switches print without a value.
for my $o (
    ['f', $opt_f, 1], ['x', $opt_x, 1], ['M', $opt_M, 1], ['P', $opt_P, 1],
    ['m', $opt_m, 1], ['j', $opt_j, 1], ['L', $opt_L, 0], ['F', $opt_F, 0],
    ['l', $opt_l, 1], ['Y', $opt_Y, 1], ['A', $opt_A, 0], ['H', $opt_H, 1],
    ['t', $opt_t, 1], ['I', $opt_I, 1], ['E', $opt_E, 0], ['a', $opt_a, 1],
    ['e', $opt_e, 1], ['y', $opt_y, 0], ['p', $opt_p, 0], ['B', $opt_B, 1],
    ['n', $opt_n, 1], ['s', $opt_s, 1], ['C', $opt_C, 1], ['c', $opt_c, 1],
    ['u', $opt_u, 1], ['Z', $opt_Z, 1], ['T', $opt_T, 1], ['O', $opt_O, 0],
    ['b', $opt_b, 1], ['U', $opt_U, 1], ['Q', $opt_Q, 0], ['S', $opt_S, 0],
    ['r', $opt_r, 1], ['d', $opt_d, 1], ['R', $opt_R, 1], ['W', $opt_W, 1],
    ['X', $opt_X, 0],
) {
    my ($switch, $value, $has_arg) = @$o;
    next unless defined($value);
    $buff .= $has_arg ? "-$switch $value " : "-$switch ";
}

print "$buff\n";

# Report whether the optional external tools are available
if ($use_R==1) {
    print "#R path: $R\n";
} else {
    print "#Not using R for plots - set path in the script or specify path with option -R\n";
}
if ($use_weblogo==1) {
    print "#Weblogo path: $weblogo\n";
} else {
    print "#No sequence logos - set path in the script or specify path with option -W\n";
}

## Global state shared by the training, evaluation and model sections.
my (%target,%pred);  #hash of arrays (stores predictions for all datapoints)
my (%rmse,%rmseERR); ##square error on key=motif length
my (%pears,%pearsERR); ##Pearson correlation and its error, key=motif length
my %rf;  ##prediction files with key=motif length
my %synapses; ##synapse file names, keyed by motif length and then by fold
my $mg; # NOTE(review): not used in the visible part of the file - purpose to be confirmed
my ($netens, $totnetens); ##networks per fold in the ensemble, and in total

my $modID; ##ID read from the first line of an uploaded model file
my %modfile; # NOTE(review): not used in the visible part of the file - purpose to be confirmed
my (%pearson,%spearman,%rmseB); ##formatted performance values, key=motif length
my $minthreshold=10; ##minimum number of different sequences needed to train

####################################################
## 2a. If a model is uploaded, skip all the training and evaluation
if (defined($opt_M))
{
    #LOAD MODEL
    # Only the first line of the model file is read here: it carries the
    # options the model was trained with and its performance figures.
    open (TM,'<',$opt_M) or die "Cannot open model file $opt_M: $!";
    my $l=<TM>;
    close TM;

    # An empty file is rejected like any other malformed header, instead of
    # silently continuing with the default settings.
    if (defined($l) && $l=~/-ID=(\S+) -m=(\S+) -j=(\S+) -n=(\S+) -F=(\S+) -L=(\S+) -B=(\S+) -C=(\S+) -s=(\S+) -l=(\S+) -y=(\S+) -e=(\S+) -a=(\S+) -b=(\S+) -pe=(\S+) -sp=(\S+) -rmse=(\S+)/)
    {
	$modID=$1;
	$bestl=$2;
	$fl=$3;
	$nh=$4;
	$encF=$5;
	$encL=$6;
	$Bo=$7;
	$cycles=$8;
	$nseeds=$9;
	$dolog=$10;
	$dostop=$11;
	$cval=$12;
	$folds=$13;
	$netens=$14;
	$pearson{$bestl}=$15;
	$spearman{$bestl}=$16;
	$rmseB{$bestl}=$17;
    }
    else
    {
	print "Unable to load MODEL file. Corrupt format.\n";
	exit;
    }
    # skip all the training and evaluation
    goto MODEL;
}

###################################
## 2b. check INPUT data is in the right format
## also remove flanks if relevant

my $dp=0; #total datapoints
my $seqr=0; ##removed sequences
my (%pp,%mpp);  ##measurements per sequence, and their mean
my @allsequences; ##all sequences, in original order
my ($fL,$fR)=("","");  ##prefix / suffix shared by all sequences (constant flanks)
my (%bbsyn,%bbsyn_cv);  ##best performing networks
my %offset; 

# Read the training data: one "SEQUENCE <whitespace> VALUE" pair per line.
# Sequences shorter than the maximum motif length are skipped; any other
# malformed line stops the program. While reading, the longest prefix and
# suffix common to all accepted sequences are tracked in $fL and $fR.
open (IN,'<',$opt_f) or die "Cant open file $opt_f: $!";
while (defined(my $l=<IN>))
{
   my $lineno=$.;  #line number in the input file, used in error messages
   chomp $l;
   next if length($l)<=1;  #don't kill on empty lines

   unless ($l=~m/^(\S+)\s+(\S+)/)
   {
       print "Wrong format at line $lineno:\n$l\n";
       exit(0);
   }
   my ($pep,$signal)=($1,$2);
   $pep =~ tr/a-z/A-Z/;

   if (length($pep)<$maxl)
   {
       $seqr++;
       print ("#--Sequence $pep removed. shorter than maximum motif length ($maxl)\n");
       next;
   }

   $dp++;
## check right number format and AA letters
   if (isAnumber($signal)==0) {
       print "Wrong format at line $lineno (not a number):$l\n";
       exit(0);
   }
   if ($pep=~m/[^ACDEFGHIKLMNPQRSTVWYX]/) {
       print "Wrong format at line $lineno (unknown amino acid):\n$l\n";
       exit(0);
   }

   if ($dp==1) ##first valid sequence
   {
       $fL=$pep;
       $fR=$pep;
   }
   else
   {
       ## are there flanks repeated in all sequences?
       ## shrink the candidate flanks until they also match this sequence
       my $nleft=length($fL);
       $nleft-- while ($nleft>0 && substr($fL,0,$nleft) ne substr($pep,0,$nleft));

       my $nright=length($fR);
       $nright-- while ($nright>0 && substr($fR,-$nright) ne substr($pep,-$nright));

       $fL = $nleft>0 ? substr($pep,0,$nleft) : "";
       $fR = $nright>0 ? substr($pep,-$nright) : "";
   }
   push (@{$pp{$pep}},$signal);
   push (@allsequences,$pep);
}
close IN;

print "#Left flank detected: $fL\n" if length($fL)>0;
print "#Right flank detected: $fR\n" if length($fR)>0; 

####################
### average identical sequences (%mpp)
my @all;        ##unique sequences
my $dun=0;      ##number of unique sequences
my $dunC=0;     ##number of sequences kept in the cross-validation subsets
my $hom_rem=0;  ##number of sequences removed by the partitioning

# Replace repeated measurements of a sequence by their mean
foreach my $seq (keys %pp)
{
    my $values=$pp{$seq};
    my $sum=0;
    $sum += $_ foreach @$values;
    $mpp{$seq}=$sum/scalar(@$values);
    push(@all,$seq);
    $dun++;
}

# Too few different sequences: give up
unless ($dun>=$minthreshold)
{
    print ("ERROR. Insufficient input data ($dun different sequences)\nNote that sequences shorter than MAX motif length are removed\n");
    exit;
}
print "#Loaded input data ($dp datapoints, $dun unique). Working directory $wd\n";

##############################################################
## sort sequences by peptide length (for hobohm1 reduction)
my @sorted;
if ($do_CM==1)
{
    # longest peptides first
    @sorted = sort { length($b) <=> length($a) } @all;
}
else {
    # no particular order: whatever order the hash returns its keys in
    @sorted = keys %mpp;
}
##################################
## save to local file without flanks

my $of="$wd/${prefix}.dat";   ##peptide TAB value
my $fsa="$wd/${prefix}.fsa";  ##same data in FASTA-like format
my $length_png="$wd/${prefix}.lengthVSperf.png";

open (OUT,'>',$of) or die "Cannot create file $of: $!";
open (FSA,'>',$fsa) or die "Cannot create file $fsa: $!";
foreach my $seq (@sorted)
{
    my $sig=$mpp{$seq};

    # strip the constant flanks detected while reading the input
    my $sh = length($fR)>0
           ? substr($seq,length($fL),-length($fR))
           : substr($seq,length($fL));

    $mpp{$sh}=$sig;  #the value is also reachable through the stripped sequence
    print OUT "$sh\t$sig\n";
    print FSA ">$sh $sig\n$sh\n";
}
close OUT;
close FSA;

die "File error. maybe no space on disk?\n" unless (-s $of);  

######################################################
## 3. Rescale data and log-transform. For a view of data distribution, the actual 
## rescaling is done on the cross-validation sets

# Name of the rescaled copy: same directory as $of, file name prefixed with "rr_"
my ($dir,$fn) = &dir_and_fname($of);
$fn = "${dir}/rr_${fn}";

if ($dolog==2) {
    # no rescaling requested: the "rescaled" file is a plain copy
    $cmd = "cp $of $fn";
} else {
    $cmd = "$perl -I $bin/lib $logtransf -f $of -L 0 -H 1 -s3";
    $cmd .= " -l" if $dolog==0;  ##just linear rescale
    $cmd .= " -r" if $reverse_data==1;  #low values are positive examples
}
system($cmd);
print "#Data rescaling complete\n";

####################################################
##  4. plot distributions before and  after processing

if ($use_R) {
   $cmd = "cat $rscripts/data.distr.2w.R | $R --vanilla --args $of $fn 'Raw data distribution' 'Data distribution after rescaling' $outf > $wd/log.txt";
   system($cmd);
}
print "#Motif length interval: $minl to $maxl with step=$stepl\n";

#### !! if NO cross-validation (folds==1) skip the following section and goto NOCV #
my $trainfile;
if ($folds==1)
{
    # A single fold means no cross-validation: the complete rescaled data set
    # becomes the training file (flanks are put back when -A is given).
    $trainfile="$wd/f000XX";

    open (IN,'<',"$fn") or die "Cannot open file $fn: $!";
    open (OUT,'>',$trainfile) or die "Cannot create file $trainfile: $!";
    while (defined(my $l=<IN>))
    {
        my ($pep,$val) = $l =~ /(\S+)\s+(\S+)/
            or die "Format error at line:\n$l\n";
        $pep = "${fL}${pep}${fR}" if defined($opt_A);
        print OUT "$pep\t$val\n";
    }
    close IN;
    close OUT;

    # No performance can be measured without cross-validation
    $bestl=$minl;
    $dunC = $dun;
    for (my $lgt=$minl; $lgt<=$maxl; $lgt+=$stepl)
    {
        ($pearson{$lgt},$spearman{$lgt},$rmseB{$lgt})=(0,0,0);
    }
    goto NOCV;  ###jump over all cross-validation steps
}

###################################################
## 5. create subsets for cross-validation
# 5.1 run Hobohm 1 algorithm if $do_CM==1, common motif ==2, random ==0

# Partition the data into $folds subsets $wd/c00<i>, using the method selected
# with -H, and count in $dunC how many sequences end up in the subsets.
if ($do_CM==1) #do homology reduction
{
    my $cmd = "$hobohm1 -gf 1000 -gn 1000 -thr $thrH -blf $matdir/BLOSUM50 $fsa > $wd/hobohm1.out";
    system($cmd);

    ### divide peptides into subsets ($folds)
    my $g=0;
    my %group;

    open (HOB,'<',"$wd/hobohm1.out") or die "Cannot open file $wd/hobohm1.out: $!";
    while (defined(my $l=<HOB>))
    {
	if ($l=~m/Unique\.\s+\S+\s+(\S+)/) {  #aim at dividing in groups of equal size

	    # unique sequences are dealt to the subsets in turn
	    $group{$1}=$g;
	    $g = $g==$folds-1 ? 0 : $g+1;
	}
	elsif ($l=~m/Not unique\.\s+\S+\s+(\S+).*homolog to (\S+)/) {
	    # a homolog joins the subset of the sequence it resembles,
	    # or is dropped altogether with -E
	    $group{$1} = $group{$2} unless defined($opt_E);
	}
    }
    close HOB;

    # one output file per subset
    my @fh;
    for (my $g=0; $g<$folds; $g++) {

	my $fh = FileHandle->new();
	push (@fh, $fh);
	open ($fh,'>',"$wd/c00$g") or die "Cannot create file $wd/c00$g: $!";
    }
    foreach my $k (keys %group)
    {
	my $g=$group{$k};
	print {$fh[$g]} "$k\t$mpp{$k}\n";
	$dunC++;
    }
    for (my $g=0; $g<$folds; $g++) {
	close $fh[$g];

	my $s = `wc $wd/c00$g | $args 1`;  #check that could create subsets
	die "Threshold on similarity $thrH too strict. Cannot create subsets.\n" if $s==0;
    }
}
elsif ($do_CM==2)  ### split using common motif
{
    my $cml = $CMthr;

   $cmd = "cat $of | $common_motif -l $cml -- | grep -v '#' > $wd/common_motif.out; ";
   $cmd .= "cat $wd/common_motif.out | grep ^Acc | $args 2,3,4 > $wd/Accepted; ";
   $cmd .= "$splitfile -nc $folds -cpfix \"$wd/a\" -fpfix \"$wd/f\" $wd/Accepted > /dev/null; rm -f $wd/f00?"; 
   system($cmd);

   my $flag=0; #goes to 1 if any subset has size 0 
   for (my $i=0; $i<$folds; $i++)
   {
       my $csize = `wc $wd/a00$i | $args 1`;
       $flag=1 if $csize==0;  ##check there is at least one seq per split
   }

   if ($flag==1)  ##just split randomly. The sequences are too similar (or selected motif too short compared to sequence length)
   {
       my $sp = split_file($of,$folds,"$wd/c00",1);
       # The random fallback keeps every sequence, exactly like the plain
       # random split below; without this $dunC stays 0 and all sequences
       # are reported as removed.
       $dunC = $dun;
   }
   else
   {
       for (my $i=0; $i<$folds; $i++)
       {
	   if (defined($opt_E))  ##strict. remove similar sequences!
	   {
	       $cmd = "cp $wd/a00$i $wd/c00$i";
	       system($cmd);
	   }
	   else
	   {
	       # add back to each subset the lines of common_motif.out whose
	       # 5th field equals the first field of an accepted sequence
	       $cmd = "rm -f $wd/c00$i.add; touch $wd/c00$i.add";
	       system($cmd);

	       open (IN,'<',"$wd/a00$i") or die "Cannot open file $wd/a00$i: $!";
	       while (defined(my $l=<IN>))
	       {
		   my @d=split(' ',$l);
		   $cmd ="cat $wd/common_motif.out | grep ^Seq | $awk -v p=$d[0] '\$5==p' | $args 2,3,5 >> $wd/c00$i.add";
		   system($cmd);
	       }
	       close IN;
	       $cmd ="cat $wd/a00$i $wd/c00$i.add | $args 1,2 > $wd/c00$i";
	       system($cmd);
	   }
	   $dunC += `wc $wd/c00$i | $args 1`; 
       }
   }
}
else   ##random split ($do_CM==0)
{
    my $sp = split_file($of,$folds,"$wd/c00",1);
    $dunC = $dun;
}

# Report the partitioning method and how many sequences were left out
$hom_rem = $dun-$dunC;
my $topr = "##$folds subsets created using ";
if ($do_CM==2) {
	$topr .= "maximum common motif of $CMthr";
} elsif ($do_CM==1) {
    $topr .= "homology reduction with threshold $thrH";
} else {
   $topr .= "random partitions";
}
print "$topr\n##Removed $hom_rem homologous sequences\n";

## 5.2 re-add constant flanks (if removed) 
# Copy every subset c00<i> to c00<i>XX; with -A the constant flanks removed
# earlier are put back around each sequence.
foreach my $i (0..$folds-1)
{
    my $src="$wd/c00$i";
    my $dst="$wd/c00${i}XX";
    open (IN,'<',$src) or die "Cannot open file $src: $!";
    open (OUT,'>',$dst) or die "Cannot create file $dst: $!";
    while (defined(my $l=<IN>))
    {
	my ($pep,$val) = $l =~ /(\S+)\s+(\S+)/
	    or die "Format error at line:\n$l\n";
	$pep = "${fL}${pep}${fR}" if defined($opt_A);
	print OUT "$pep\t$val\n";
    }
    close IN;
    close OUT;
}

## 5.3 combine subsets into training sets, stop sets and evaluation sets

#complementary sets

 # f00<n>XX = all subsets except subset <n>
 foreach my $n (0..$folds-1) {
    my @others = grep { $_ != $n } 0..$folds-1;
    $cmd = "cat" . join("", map { " $wd/c00${_}XX" } @others) . " > $wd/f00${n}XX";
    system($cmd);
}
if ($cval==1) {
    #training sets f$n.$m (notation: f$n.$m -> $n=stop, $m=evaluation)
    #made of all subsets except the stop subset and the evaluation subset
    foreach my $n (0..$folds-1)
    {
	foreach my $m (0..$folds-1)
	{
	    next if $n==$m;
	    my @train = grep { $_ != $n && $_ != $m } 0..$folds-1;
	    my $cat = "cat" . join("", map { " $wd/c00${_}XX" } @train) . " > $wd/f${n}.${m}XX";
	    system($cat);
	}
    }
}
print "###Training set, stopping sets and evaluation sets created\n";

##############################################################
# 6. Rescale data (defined transform on fn,m and also apply to c00n) 

{
    # Build and run the command that rescales one training file and applies
    # the same treatment to one subset:
    #   $train  training file the transform is defined on   (in $wd)
    #   $subset subset the transform is also applied to     (in $wd)
    #   $dest   final name of the rescaled subset           (in $wd)
    my $rescale = sub {
	my ($train,$subset,$dest) = @_;
	if ($dolog==2) {
	    # no rescaling: plain copies under the "rr_" names
	    $cmd = "cp $wd/$train $wd/rr_$train; ";
	    $cmd .="cp $wd/$subset $wd/$dest";
	} else {
	    $cmd = "$perl -I $bin/lib $logtransf -f $wd/$train -L 0 -H 1 -s3";
	    $cmd .= " -l" if $dolog==0; #no log-transform
	    $cmd .= " -r" if $reverse_data==1;  #low values are positive examples
	    $cmd .= " $wd/$subset; ";
	    $cmd .= "mv $wd/rr_$subset $wd/$dest";
	}
	system($cmd);
    };

    if ($cval==1) {
	##rescale stop set
	foreach my $n (0..$folds-1) {
	    foreach my $m (0..$folds-1) {
		next if $n == $m;
		$rescale->("f${n}.${m}XX", "c00${n}XX", "rr_c00${n}.${m}XX");
	    }
	}
    }
    #rescale eval set
    foreach my $n (0..$folds-1) {
	$rescale->("f00${n}XX", "c00${n}XX", "rr_c00${n}XX.EVAL");
    }
}

print "####Data pre-processing complete!\n";

#########################################################################
#######################
## TRAIN networks on the range of motif lengths $minl to $maxl
# Main cross-validation loop, run once per motif length between $minl and
# $maxl. For each length it
#   - lists the synapse files to be produced (one per fold, architecture,
#     encoding and seed) and launches the training jobs through $cm,
#   - keeps the $netens best networks of every fold,
#   - runs these networks on the evaluation subset of their fold,
#   - merges the predictions of all folds into one file,
#   - computes Pearson correlation and RMSE and remembers in $bestl the
#     length with the lowest RMSE.
# NOTE(review): $encF and $encL are initialised to 0 where they are declared,
# so the "defined($encF)" / "defined($encL)" tests below are true on every run
# and -elpfr / -eplen are always appended, with or without -F / -L. The same
# tests are used when the networks are played, so confirm the intent and
# change all occurrences together.
for (my $lgt=$minl; $lgt<=$maxl; $lgt+=$stepl)
{
    ##################################################
    # 8. Train NN-align

    print "##### Training NN... ";

    if ($cval==1)  ##extensive cross-validation (slower)
    {
	##first create synapse files lists
	## (one list per evaluation fold $m, holding the networks of all stop folds $n)
		for (my $m=0; $m<$folds; $m++)
		{  
			$cmd = "rm -f $wd/${m}.synlist.lg${lgt}.txt";
			system($cmd);
			for (my $n=0; $n<$folds; $n++) {
				if ($n != $m) {
					for (my $ss=0; $ss<=$#nh; $ss++) {
					   for (my $enc=0; $enc<=$#bl_enc; $enc++) {
						   for (my $i=1; $i<=$nseeds; $i++)
						   {
								my $et= $bl_enc[$enc]==0 ? "sp" : "bl";
								my $sname="$wd/synfiles/${n}.${m}.$nh[$ss].$et.${i}.lg${lgt}.syn";
								$cmd = "echo $sname >> $wd/${m}.synlist.lg${lgt}.txt";
								system($cmd);
								push(@{$synapses{$lgt}{$m}},$sname);
							}
					   }
					}
				}
			}
		}

		## train ANNs splitting jobs with the child manager
		my $yy=1;  ## running job number, printed as progress
		for (my $m=0; $m<$folds; $m++) {
			for (my $n=0; $n<$folds; $n++) {
				if ($n != $m) {
					for (my $ss=0; $ss<=$#nh; $ss++) {
					   for (my $enc=0; $enc<=$#bl_enc; $enc++) {
						   for (my $i=1; $i<=$nseeds; $i++)
						   {
							   my $et= $bl_enc[$enc]==0 ? "sp" : "bl";
							   $cmd = "$smm_train -i $cycles -rsc -l $lgt -s $i -nh $nh[$ss] -fl $fl -blr -blf $blf -bls 50 -mpat $matdir/BLOSUM%i -syn $wd/synfiles/${n}.${m}.$nh[$ss].$et.${i}.lg${lgt}.syn -ft $wd/rr_c00${n}.${m}XX";
							   $cmd .= " -bl " if $bl_enc[$enc]==1;
							   $cmd .= " -teststop" if $dostop==1;
							   $cmd .= " -elpfr" if defined($encF);
							   $cmd .= " -eplen" if defined($encL);
							   $cmd .= " $wd/rr_f${n}.${m}XX > $wd/nnres/${n}.${m}.$nh[$ss].$et.${i}.lg${lgt}.pep.alg";
							   $cm->start($cmd);
							   print "$yy ";
							   $yy++;
						   }		    
					   }
				   }
				}
			}
		}
		##wait for all processes to finish
		$cm->wait_all_children;
    }
    else  ##faster training (performance will be evaluated on the stop sets). This is the default if no stopping is required.
	{
		## create synapse files lists (one list per fold $m)
		for (my $m=0; $m<$folds; $m++)
		{  
			$cmd = "rm -f $wd/${m}.synlist.lg${lgt}.txt";
			system($cmd);
			for (my $ss=0; $ss<=$#nh; $ss++) {
				for (my $enc=0; $enc<=$#bl_enc; $enc++) {
					for (my $i=1; $i<=$nseeds; $i++)
					{
						my $et= $bl_enc[$enc]==0 ? "sp" : "bl";
						my $sname="$wd/synfiles/${m}.$nh[$ss].$et.${i}.lg${lgt}.syn";
						$cmd = "echo $sname >> $wd/${m}.synlist.lg${lgt}.txt";
						system($cmd);
						push(@{$synapses{$lgt}{$m}},$sname);
					}
				 }  
			 }	
		}

		## train ANNs splitting jobs with the child manager
		my $yy=1;  ## running job number, printed as progress
		for (my $m=0; $m<$folds; $m++)
		{	
			for (my $ss=0; $ss<=$#nh; $ss++) {
				for (my $enc=0; $enc<=$#bl_enc; $enc++) {
					for (my $i=1; $i<=$nseeds; $i++)
					{
						my $et= $bl_enc[$enc]==0 ? "sp" : "bl";
				 		$cmd = "$smm_train -i $cycles -rsc -l $lgt -s $i -nh $nh[$ss] -fl $fl -blr -blf $blf -bls 50 -mpat $matdir/BLOSUM%i -syn $wd/synfiles/${m}.$nh[$ss].$et.${i}.lg${lgt}.syn -ft $wd/rr_c00${m}XX.EVAL";
				 		$cmd .= " -bl" if $bl_enc[$enc]==1;
				 		$cmd .= " -teststop" if $dostop==1;
				 		$cmd .= " -elpfr" if defined($encF);
				 		$cmd .= " -eplen" if defined($encL);
				 		$cmd .= " $wd/rr_f00${m}XX > $wd/nnres/${m}.$nh[$ss].$et.${i}.lg${lgt}.pep.alg";
				 		$cm->start($cmd);
				 		print "$yy ";
						$yy++;
					}
				}
			}	
		}
		##wait for all processes to finish
		$cm->wait_all_children;
    }
    print "\n######L=$lgt - Neural network training complete\n";

#######################################################
### 10. select top $bs networks for each cross-validation step

     ## the ensemble size per fold is $bs, capped at the number of networks trained for fold 0
     my @stmp = @{$synapses{$lgt}{'0'}};
     $netens = $bs > $#stmp ? $#stmp+1 : $bs; ##how many networks (per fold) in the ensemble?
     $netens = $#stmp+1 if $folds==1;  #no perf evaluation without cross-validation

     $totnetens=$netens * $folds; ##total networks in final ensemble
 
     for (my $m=0; $m<$folds; $m++)
     {	  
		 my @synbl=@{$synapses{$lgt}{$m}};
		 my %perf;  ## value read from the header of each synapse file
			 my $bnfile="$wd/${m}.synlist.lg${lgt}.best${netens}.txt";
			 open (BSN,'>',$bnfile) or die "Cannot create $bnfile: $!\n";
	
		 for (my $i=0; $i<=$#synbl; $i++)  ##sort networks on test-set performance (min RMSE)
		 {	 
			 my $syn=$synbl[$i];
			 ## field of the first line of the synapse file that holds the performance
			 my $headp= $dostop==1 ? 13 : 9;   ## synapse header is different when using stopping set
			 my $p = `head -1 $syn | $args $headp`;
			 chomp $p;
			 $perf{$syn}=$p;
		 }	
		 my $i=0;
		 ## sortbyvalue() is defined elsewhere in the file; presumably it orders
		 ## the keys of %perf by their value - TODO confirm
		 foreach my $syn (sort {sortbyvalue(%perf)} keys %perf)
		 {
			 $i++;
			 last if ($i>$netens);  ## keep only the first $netens networks
			 push(@{${bbsyn{$lgt}}},$syn);
			 push(@{${bbsyn_cv{$lgt}{$m}}},$syn);
				 print BSN "$syn\n";
			 $offset{$syn}=0;
		 }
		 close BSN;
     }
    ########################################################
    # 11. Play networks on evaluation sets, only using the top networks

    for (my $m=0; $m<$folds; $m++)
    {
		$cmd = "$smm_play -fl $fl -l $lgt -bls 50 -mpat $matdir/BLOSUM%i -blf $blf";
		$cmd .= " -elpfr" if defined($encF);
		$cmd .= " -eplen" if defined($encL);
	
		$cmd .=" $wd/${m}.synlist.lg${lgt}.best${netens}.txt $wd/rr_c00${m}XX.EVAL > $wd/c00$m.bl.lg${lgt}.eval.best${netens}.pred";
		system($cmd);
    }

    ######################################################
    # 12. Combine results of the $folds (5)  evaluation sets
    # results have format PEPTIDE tab TARGET tab PREDICTION tab CORE

    ## the name of the combined file is fixed ("c01234"), whatever the number of folds
    my $rf="$wd/c01234.bl.lg${lgt}.pred";
    $rf{$lgt}=$rf;
    open (OUT,'>',$rf) or die "Cannot create file $rf: $!";

    for (my $m=0; $m<$folds; $m++)
    {
		my $df="$wd/c00$m.bl.lg${lgt}.eval.best${netens}.pred";
		open (IN,'<',$df) or die "Cannot open file $df: $!";
	
		while (defined(my $l=<IN>))
		{
			next if ($l=~m/\#/);  ##grep -v "#"
			## Core Start Target Pred Peptide
			if ($l=~m/(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)/)
			{
			print OUT "$5\t$3\t$4\t$1\n";
			push(@{$target{$lgt}},$3);
			push(@{$pred{$lgt}},$4);
			}
		}
		close IN;
    }

    close OUT;

    print "#######L=$lgt - Evaluation in $folds-fold cross validation\n";

    ############################################################
    # 14. Evaluate performance

    ##bootstrap pearson and RMSE
    ## (fields 12 and 13 of the tool output are stored as the value and its error)

    my $tmp=`cat $wd/c01234.bl.lg${lgt}.pred | $args 2,3 | $xycorr -bt -- | $args 12,13`;
    ($pears{$lgt},$pearsERR{$lgt})=split(' ',$tmp);

    $tmp=`cat $wd/c01234.bl.lg${lgt}.pred | $args 2,3 | $xyerror -bt -rmse -- | $args 12,13`;
    ($rmse{$lgt},$rmseERR{$lgt})=split(' ',$tmp);

    print "########L=$lgt - RMSE (window=$lgt) = $rmse{$lgt}\n";
    print "########L=$lgt - Pearson (window=$lgt) = $pears{$lgt}\n";

    ## the motif length with the lowest RMSE is the best one
    if ($rmse{$lgt}<$bestRMSE) {
		$bestRMSE=$rmse{$lgt};
		$bestl=$lgt;
    }
}

######################################
# make graph of motif length vs. performance

# Only meaningful with R available and more than one motif length tested
if ($use_R && $maxl!=$minl)
{
    # one line per motif length: length, RMSE, RMSE error, Pearson, Pearson error
    my $points="$wd/pfpoints.txt";
    open (GR,'>',$points) or die "Cannot create file $wd/pfpoints.txt:  $!";
    my $lgt=$minl;
    while ($lgt<=$maxl)
    {
		print GR join("\t",$lgt,$rmse{$lgt},$rmseERR{$lgt},$pears{$lgt},$pearsERR{$lgt}) . "\n";
		$lgt+=$stepl;
    }
    close GR;

    $cmd="cat $rscripts/R-plot_pfsize.R | $R --vanilla --args $wd/pfpoints.txt $length_png > $wd/Rlog.txt";
    system($cmd);
}
############################################################
#  Make scatterplot of target vs prediction

# Per motif length: optional scatterplot (which also yields the Spearman
# correlation) and formatting of the performance values.
for (my $lgt=$minl; $lgt<=$maxl; $lgt+=$stepl)
{
   $spearman{$lgt}='NA';  #only with R
   if ($use_R) {
      my $pdf="$wd/${prefix}.scatterplot.lg${lgt}.pdf";
      my (undef,$rho) = scatterplot($rf{$lgt},$dunC,$lgt,$pdf);
      $spearman{$lgt}=sprintf("%.4f",$rho);
   }

   ##format correlation
   $pearson{$lgt}=sprintf("%.4f",$pears{$lgt});
   $rmseB{$lgt}=sprintf("%.6f",$rmse{$lgt});

   # Disabled report of the values per motif length:
   #   print "### MOTIF LENGTH: $lgt\n";
   #   print "### RMSE: $rmseB{$lgt}\n";
   #   print "### Pearson: $pearson{$lgt}\n";
   #   print "### Spearman: $spearman{$lgt}\n" if $use_R;
}
#################################################################################
##### NO cross-validation section. simply train on all data
NOCV: ###all the evaluation above must be skipped if no cross-validation is made

# Without cross-validation (a single fold) every network is trained on the
# whole data set, and all of them enter the final ensemble.
# NOTE(review): $encF and $encL are initialised to 0 where they are declared,
# so the "defined(...)" tests below are always true and -elpfr / -eplen are
# always passed. They are left unchanged to stay consistent with the rest of
# the file - confirm the intent.
if ($folds==1)
{
    for (my $lgt=$minl; $lgt<=$maxl; $lgt+=$stepl)
    {
	## list the synapse files that will be produced for this motif length
	$cmd = "rm -f $wd/0.synlist.lg${lgt}.txt";
	system($cmd);
	for (my $ss=0; $ss<=$#nh; $ss++) {
	    for (my $enc=0; $enc<=$#bl_enc; $enc++) {
		for (my $i=1; $i<=$nseeds; $i++)
		{
		    my $et= $bl_enc[$enc]==0 ? "sp" : "bl";
		    my $sname="$wd/synfiles/0.$nh[$ss].$et.${i}.lg${lgt}.syn";
		    $cmd = "echo $sname >> $wd/0.synlist.lg${lgt}.txt";
		    system($cmd);
		    push(@{$synapses{$lgt}{'0'}},$sname);
		    push(@{$bbsyn{$lgt}},$sname);
		    push(@{$bbsyn_cv{$lgt}{'0'}},$sname);
		}
	    }
	}
	print "#### Training NN on all data, no cross-validation...";
	## train ANNs splitting jobs with the child manager
	for (my $ss=0; $ss<=$#nh; $ss++)
	{
	    for (my $enc=0; $enc<=$#bl_enc; $enc++)
	    {
		for (my $i=1; $i<=$nseeds; $i++)
		{
		    my $et= $bl_enc[$enc]==0 ? "sp" : "bl";
		    $cmd = "$smm_train -i $cycles -rsc -l $lgt -s $i -nh $nh[$ss] -fl $fl -blr -blf $blf -bls 50 -mpat $matdir/BLOSUM%i -syn $wd/synfiles/0.$nh[$ss].$et.${i}.lg${lgt}.syn -ft $trainfile";
		    $cmd .= " -bl" if $bl_enc[$enc]==1;
		    $cmd .= " -elpfr" if defined($encF);
		    $cmd .= " -eplen" if defined($encL);
		    $cmd .= " $trainfile > $wd/nnres/0.$nh[$ss].$et.${i}.lg${lgt}.pep.alg";
		    $cm->start($cmd);
		}
	    }
	}
	##wait for all processes to finish
	$cm->wait_all_children;

	## report the motif length that was just trained
	print "\n######L=$lgt - Neural network training (without cross-validation) complete\n";
    }

    ## ensemble size = number of networks trained per motif length
    my @stmp = @{$synapses{$minl}{'0'}};
    $netens = $#stmp+1;
    $totnetens=$netens;

}

### OFFSET CORRECTION
###############################################################
## For every motif length this section
##   1. aligns the networks to a common register with $pssm_align (skipped with -O),
##   2. appends the resulting offset to each synapse file (written as <name>.off),
##   3. concatenates all synapse files into a single model file,
##   4. builds the sequence logo and the log-odds/frequency matrices from the
##      top-scoring random peptides,
##   5. re-runs the cross-validated evaluation sets with the offset applied.

my $natpepf="$matdir/rand.30mers.txt";
my $natpepf_short="$matdir/rand.30mers.short.txt";
my $parall=10;
## split_file returns the number of files it was asked to write, which is 1 (not
## $parall) for inputs shorter than 1000 lines; the loops below iterate over $p.
my $p=split_file($natpepf,$parall,"$wd/natpep",0);
my (%sorted_pred);
my %peps4logo;

for (my $lgt=$minl; $lgt<=$maxl; $lgt+=$stepl)
{
    ### run networks on random N-mers
    my $lgcut=$lgt+2*$fl;
    my $rand_mers="$wd/rand${lgcut}.txt";
    $cmd = "cut -c1-$lgcut $natpepf > $rand_mers";
    system($cmd);

    ## graphical parameters: logo width is 1.5 per position, with a minimum of 9
    my $width= $lgt < 1.5*6 ? 1.5*6 : 1.5*$lgt;

    ##############################################
    ## ALIGN NETWORKS TO A COMMON REGISTER

    my $peps4logo="$wd/peps4logo.lg${lgt}.txt";
    $peps4logo{$lgt}=$peps4logo;
    my $matlist="$wd/${prefix}.lg${lgt}.matlist";
    my $maxoffset=$lgt;
    my @selnets=@{$bbsyn{$lgt}};  #top networks

    unless (defined($opt_O))
    {
        print "### Running offset correction now for L=$lgt...\n";

        ### run networks on the random N-mers -> make PSSM for each network
        for (my $i=0; $i<=$#selnets; $i++)
        {
            my $syn_name=$selnets[$i];
            $cmd = "$smm_play -fl $fl -blf $blf -l $lgt -bls 50 -mpat $matdir/BLOSUM%i";
            $cmd .= " -elpfr" if defined($encF);
            $cmd .= " -eplen" if defined($encL);
            $cmd .= " -s $syn_name $natpepf > $wd/rand.net${i}.pred";
            $cm->start($cmd);
        }

        $cm->wait_all_children();

        ## one matrix per network, built from its 1000 best-scoring peptides;
        ## MATLIST pairs every matrix file with the synapse file it came from
        open (MATLIST,'>',$matlist) or die "Cannot create file $matlist: $!";
        for (my $i=0; $i<=$#selnets; $i++)
        {
            my $syn_name=$selnets[$i];
            $cmd = "cat $wd/rand.net${i}.pred | grep -v '#' | sort -nrk4 | head -1000 | cut -f1 -d ' ' | $pep2mat -swt 1 -blf $blf -freq -- > $wd/net${i}.mat";
            $cm->start($cmd);
            print MATLIST "$wd/net${i}.mat $syn_name\n";
        }
        close MATLIST;
        $cm->wait_all_children();

        $cmd = "$pssm_align -ts $ts_pssm -i $i_pssm -maxoff $maxoffset -nt 20 -blf $blf ";
        $cmd .= "-p1a " if $p1a==1; ##hydrophobic AA preference at P1
        $cmd .= "$matlist | grep -v '#' | $args 2,3 > $wd/synlist.lg${lgt}.offset ";
        system($cmd);

        ## the offset file holds "<synapse file> <offset>" pairs
        open (OFF,'<',"$wd/synlist.lg${lgt}.offset") or die "Cannot open synlist file: $!";
        while (defined(my $l=<OFF>))
        {
            if ($l=~ m/(\S+)\s+(\S+)/) {
                $offset{$1}=$2;
            }
        }
        close OFF;
    }

    #### modify synapse files adding the calculated OFFSET to each network
    for (my $m=0; $m<$folds; $m++)
    {
        my @nets=@{$bbsyn_cv{$lgt}{$m}};
        my $synfile="$wd/${m}.synlist.lg${lgt}.txt";
        open (SL,'>',$synfile) or die "Cannot open $synfile: $!";

        for (my $i=0; $i<=$#nets; $i++)
        {
            ## networks without a computed offset (e.g. with -O) get offset 0
            my $off= defined($offset{$nets[$i]}) ? $offset{$nets[$i]} : 0;
            my $newname=$nets[$i] . ".off";
            print SL "$newname\t$off\n";
            open (OSYN,'<',$nets[$i]) or die "Cannot open file $nets[$i]: $!";
            open (NSYN,'>',$newname) or die "Cannot create file $newname: $!";
            while (defined(my $l=<OSYN>)) {
                chomp $l;
                $l .= " OFFSET $off" if $l=~ m/TESTRUNID/;
                print NSYN "$l\n";
            }
            close NSYN;
            close OSYN;
            ## from here on the fold refers to the offset-stamped copy
            $bbsyn_cv{$lgt}{$m}->[$i]=$newname;
            $offset{$newname}=$off;
        }
        close SL;
    }

    #################################################################
    ### 13. Save synapse files, and all options, to a single file. This file should
    ## contain all information to allow running the trained method on new data

    $modfile{$lgt} = "${prefix}.lg${lgt}.model.txt";
    open (MOD,'>',"$wd/$modfile{$lgt}") or die "Died. Cannot create $wd/$modfile{$lgt}: $!\n";

    my $synheader="#-ID=$prefix -m=$lgt -j=$fl -n=$nh -F=$encF -L=$encL -B=$Bo -C=$cycles -s=$nseeds -l=$dolog -y=$dostop -e=$cval -a=$folds -b=$netens -pe=$pearson{$lgt} -sp=$spearman{$lgt} -rmse=$rmseB{$lgt}\n"; #network info
    print MOD $synheader;

    for (my $m=0; $m<$folds; $m++)
    {
        my @nets=@{$bbsyn_cv{$lgt}{$m}};  #only selected networks
        for (my $i=0; $i<=$#nets; $i++)
        {
            my $syn = $nets[$i];
            open (IN,'<',$syn) or die "Died. Cannot open file $syn: $!";
            my $l;
            print MOD $l while defined($l=<IN>);
            close IN;
        }
    }
    close MOD;

    ##### LOGOS #################
    ###
    ###SPLIT FILE FOR PARALLLEL RUN, and predict best scoring peptides for logo generation
    ### (iterate over $p, the number of natpep.split.N.txt files reported by split_file)

    for (my $i=0; $i<$p; $i++)
    {
        $cmd = "$smm_play -fl $fl -blf $blf -l $lgt -bls 50 -mpat $matdir/BLOSUM%i";
        $cmd .= " -elpfr" if defined($encF);
        $cmd .= " -eplen" if defined($encL);
        $cmd .= defined($opt_O) ? " -s $wd/$modfile{$lgt}" : " -offset $wd/synlist.lg${lgt}.offset"; #do correction
        $cmd .= " $wd/natpep.split.${i}.txt > $wd/natpep.split.${i}.pred";
        $cm->start($cmd);
    }
    $cm->wait_all_children;

    ## sort cores of the best scoring peptides: each split is sorted on its own,
    ## then the sorted pieces are merged (sort -m) and the top 1000 are kept
    $mg="sort -mnrk2";
    for (my $i=0; $i<$p; $i++)
    {
        $cmd = "cat $wd/natpep.split.${i}.pred | grep -v '#' | sort -nrk4 | cut -f1,4 -d ' ' > $wd/nat.${i}.sorted.txt";
        system($cmd);
        $mg .= " $wd/nat.${i}.sorted.txt";
    }

    $mg .= " | head -1000 | cut -f1 -d ' ' > ${peps4logo}.X";
    system($mg);

    ### substitute Xs with random amino acids (weblogo does not support X)
    open (PEP,'>', $peps4logo) or die "Cannot create file $peps4logo: $!";
    open (PEPX,'<', "${peps4logo}.X") or die "Cannot open file ${peps4logo}.X: $!";
    while (defined(my $l=<PEPX>)) {
        while ($l =~ /X/) {
            my $aa = substr($AA,20*rand(),1);
            $l =~ s/X/$aa/;
        }
        print PEP $l;
    }
    close PEPX;
    close PEP;

    #### MAKE COMBINED LOGO
    ## retry with a larger canvas until a non-empty PNG is produced (height capped at 50)

    if ($use_weblogo) {
        my $h=18;
        my $w=$width;
        until (-s "$wd/${prefix}.lg${lgt}.logo.png" || $h>50) {
            $cmd="$seqlogo -f $peps4logo -F PNG -B $logoBits -p -c -n -Y -t \"$prefix\" -w $w -h $h -o $wd/${prefix}.lg${lgt}.logo > /dev/null 2>&1";
            system($cmd);
            $h++;
            $w++;
        }
    }

    ### log-odds matrix representation of the motif

    $cmd = "cat $peps4logo | $pep2mat -swt 1 -blf $blf -- | grep -v '#' | grep -v 'Last' > $wd/${prefix}.lg${lgt}.lo.mat";
    system($cmd);

    $cmd = "cat $peps4logo | $pep2mat -swt 1 -blf $blf -freq -- | grep -v '#' | grep -v 'Last' > $wd/${prefix}.lg${lgt}.freq.mat";
    system($cmd);

    ###################################
    ###make ALL LOGOS of networks in the final ensemble
    if ($opt_Q && $use_weblogo)
    {
        my @lsyn = @{$bbsyn{$lgt}};
        for (my $cn=0; $cn<=$#lsyn; $cn++)
        {
            $cmd = "$smm_play -fl $fl -l $lgt -bls 50 -mpat $matdir/BLOSUM%i -blf $blf";
            $cmd .= " -elpfr" if defined($encF);
            $cmd .= " -eplen" if defined($encL);

            $cmd .=" -s $lsyn[$cn] $natpepf > $wd/natpep.net.${cn}.pred; ";

            $cmd .= "cat $wd/natpep.net.${cn}.pred | grep -v '#' | sort -nrk4 | head -1000 | cut -f1 -d ' ' > $wd/natpep.topscoring.net.${cn}.txt";

            $cm->start($cmd);
        }
        $cm->wait_all_children();

        for (my $cn=0; $cn<=$#lsyn; $cn++)
        {
            my $cnet=$lsyn[$cn];
            ## no offset is known for a network when alignment was skipped (-O):
            ## show 0, the same default used when stamping the synapse files
            my $cnet_off = defined($offset{$cnet}) ? $offset{$cnet} : 0;
            my $h=18;
            my $w=$width;
            until (-s "$wd/${prefix}.lg${lgt}.net${cn}.logo.png" || $h>50) {
                $cmd="$seqlogo -f $wd/natpep.topscoring.net.${cn}.txt -F PNG -B $logoBits -p -c -n -Y -t \"$prefix (net $cn - off $cnet_off)\" -w $w -h $h -o $wd/${prefix}.lg${lgt}.net${cn}.logo > /dev/null 2>&1";
                system($cmd);
                $h++;
                $w++;
            }
        }
    }

    ########################################################
    # Play networks once more in cross-validated evaluation sets, with offset correction

    ## target, prediction and core of every evaluated peptide, keyed by peptide
    my (%tg,%pr,%co);

    if ($folds>1)
    {
        for (my $m=0; $m<$folds; $m++)
        {
            $cmd = "$smm_play -fl $fl -l $lgt -bls 50 -mpat $matdir/BLOSUM%i -blf $blf";
            $cmd .= " -elpfr" if defined($encF);
            $cmd .= " -eplen" if defined($encL);

            $cmd .=" -offset $wd/${m}.synlist.lg${lgt}.txt $wd/rr_c00${m}XX.EVAL > $wd/c00$m.bl.lg${lgt}.eval.off.pred";
            system($cmd);
        }

        ######################################################
        # Combine results of the $folds evaluation sets
        # results have format PEPTIDE tab TARGET tab PREDICTION tab CORE

        my $rf="$wd/c01234.bl.lg${lgt}.off.pred";
        $rf{$lgt}=$rf;
        open (OUT,'>',$rf) or die "Cannot create file $rf: $!";

        for (my $m=0; $m<$folds; $m++)
        {
            my $df="$wd/c00$m.bl.lg${lgt}.eval.off.pred";
            open (IN,'<',$df) or die "Cannot open file $df: $!";

            while (defined(my $l=<IN>))
            {
                next if ($l=~m/\#/);  ##grep -v "#"
                ## Core Start Target Pred Peptide
                if ($l=~m/(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)/)
                {
                    print OUT "$5\t$3\t$4\t$1\n";
                    $co{$5}=$1;
                    $pr{$5}=$4;
                    $tg{$5}=$3;
                }
            }
            close IN;
        }
        close OUT;
        print "#######Completed re-alignment with offset correction\n";
    }

    ######################################
    # sort on prediction value for best length, or show in original order
    if ($folds>1)
    {
        $sorted_pred{$lgt}="${prefix}.cores.lg${lgt}.sorted.txt";
        if (defined($opt_S))
        {
            $cmd = "cat $rf{$lgt} | sort -nrk3 > $wd/$sorted_pred{$lgt}";
            system($cmd);
        }
        else {  #original order
            open (PRD,'>',"$wd/$sorted_pred{$lgt}") or die "Cannot create file $wd/$sorted_pred{$lgt}: $!";
            for (my $i=0; $i<=$#allsequences; $i++)
            {
                my $cseq = $allsequences[$i];
                ## unless -A is given, strip the flanks $fL/$fR from the stored sequence
                unless (defined($opt_A)) {
                    if (length($fR)>0) {
                        $cseq=substr($allsequences[$i],length($fL),-length($fR));
                    }
                    else {
                        $cseq=substr($allsequences[$i],length($fL));
                    }
                }
                if (defined($co{$cseq})) { #skip removed homologous sequences
                    print PRD "$cseq\t$tg{$cseq}\t$pr{$cseq}\t$co{$cseq}\n";
                }
            }
            close PRD;
        }
    }
}
################################################################
#### 15. Copy results files from working directory to results directory
## Two cases: a previously trained model was supplied with -M (only the model
## file itself is copied), or a model was trained in this run (predictions,
## scatterplots, model, logos and matrices are copied for every motif length).

MODEL:

if ($opt_M)
{
    ## keep a copy of the uploaded model both in the working and the results directory
    $modfile{$bestl}="${modID}.model.txt";
    $cmd="cp $opt_M $wd/$modfile{$bestl}; cp $opt_M $resdir/$modfile{$bestl}";
    system($cmd);
    $totnetens=$netens * $folds;
    print "## Saved NN model to file $resdir/$modfile{$bestl}\n";
}
else
{
   for (my $lgt=$minl; $lgt<=$maxl; $lgt+=$stepl)
   {
      ## cross-validated predictions (and their scatterplot) exist only when $folds>1
      if ($folds>1)
      {
      	 ## prepend a column header to the predictions written in the working directory
      	 $cmd = "echo 'Sequence\tObserved\tPrediction\tCore' > $resdir/$sorted_pred{$lgt}; ";
      	 $cmd .= "cat $wd/$sorted_pred{$lgt} >> $resdir/$sorted_pred{$lgt}";
         system($cmd);
         print "## Saved alignment and predictions to $resdir/$sorted_pred{$lgt}\n";
                  
         if ($use_R) {
	        $cmd = "cp $wd/${prefix}.scatterplot.lg${lgt}.pdf $resdir";     
	     	system($cmd);
	     	print "## Saved prediction scatterplot to ${prefix}.scatterplot.lg${lgt}.pdf\n";
	     }	     
      }

      ## model file always; combined logo only if it was actually produced
      $cmd="cp $wd/${prefix}.lg${lgt}.model.txt $resdir; ";
      $cmd .= "cp $wd/${prefix}.lg${lgt}.logo.png $resdir; " if (-e "$wd/${prefix}.lg${lgt}.logo.png");
      system($cmd);
      print "## Saved NN model to file $resdir/${prefix}.lg${lgt}.model.txt\n";
      print "## Saved combined Sequence Logo $resdir/${prefix}.lg${lgt}.logo.png\n" if (-e "$resdir/${prefix}.lg${lgt}.logo.png");

      $cmd = "cp $wd/${prefix}.lg${lgt}.lo.mat $resdir; cp $wd/${prefix}.lg${lgt}.freq.mat $resdir; ";          
      system($cmd);
      print "## Saved final frequency matrix to $resdir/${prefix}.lg${lgt}.freq.mat\n";
      print "## Saved final log-odds matrix to $resdir/${prefix}.lg${lgt}.lo.mat\n";

      ## per-network logos: copy every non-empty one among the $totnetens candidates
      for (my $i=0; $i<$totnetens; $i++) {
		  if (-s "$wd/${prefix}.lg${lgt}.net${i}.logo.png") {
			  $cmd = "cp $wd/${prefix}.lg${lgt}.net${i}.logo.png $resdir";
			  system($cmd);
			  print "## Saved individual network Sequence Logo $resdir/${prefix}.lg${lgt}.net${i}.logo.png\n"
		  }
      }
   }
   ## NOTE(review): the message below assumes $outf is named $prefix.data.distrib.pdf -- confirm where $outf is set
   if ($use_R) {
       $cmd ="cp $outf $resdir";
       system($cmd);  
       print "## Saved data distribution to $resdir/$prefix.data.distrib.pdf\n";
   }
   ## NOTE(review): likewise assumes $length_png is named ${prefix}.lengthVSperf.png -- confirm
   if (-e $length_png) {
      $cmd ="cp $length_png $resdir" ;
      system($cmd);
      print "## Saved plot of core lengths to $resdir/${prefix}.lengthVSperf.png\n";
   }
}

## NOTE(review): the return status of the copy commands above is never checked,
## so this message is printed even if a copy failed
print "######COPIED RESULTS FILES TO FOLDER $resdir\n";

############
################################################################
##### 17. Run networks on the evaluation file (if provided)
## The file given with -x may be
##   * a FASTA file (first line starts with '>'): every protein is cut into
##     overlapping peptides of length $bestl+2*$fl                 -> $fasta==1
##   * a list of peptides, optionally followed by comments          -> $fasta==0
##   * a list of peptides followed by a numeric value               -> $fasta==-1
## The format is decided on the first line only. Peptides are scored with the
## model of length $bestl; for the last format RMSE and correlations are reported.

if (defined($opt_x))
{
   print "# Reading evaluation file.\n";
   my $evalsetfile="$wd/evalset.txt";
   open (EV,'>',$evalsetfile) or die "Cannot create file $evalsetfile: $!\n";

   my $evscat_pdf="$wd/${prefix}.eval.scatterplot.pdf";
   my (%fragments,%core);   ## peptide -> FASTA ids containing it; cores already reported
   my (@evtarget,@evpred);
   my $evpoints=0;
   my $fastaentries=0;
   my ($evpears,$evspear,$evRMSE)=(0,'NA',0);
   my $fasta=0;
   my $fsac="nA!";          ## id of the current FASTA entry ("nA!" when it has none)
   my $cmd;
   open (IN,'<',$opt_x) or die "Cannot open file $opt_x: $!\n";
   ### for FASTA format, the first line of the file must start with >
   if (defined(my $first=<IN>))
   {
       if ($first=~ m/^>/)
       {
           $fasta=1;
           ## keep only the first word of the header, exactly as done for the
           ## headers of all following entries
           my @hwords = split ' ',$first;
           $fsac = $hwords[0]=~m/>(.*)/ ? $1 : "";
           $fsac="nA!" if length($fsac)==0;
           $fastaentries++;
       }
       elsif ($first=~/^(\S+)/)
       {
           my $seq=$1;
           my $val=0;
           $fasta=0;
           if ($first=~/^(\S+)\s+(\S+)/)  ##peptides with value
           {
               if (isAnumber($2))  ##if not numeric, assumes that all second fields will be comments
               {
                   $val=$2;
                   $fasta=-1;
               }
           }
           $seq =~ tr/a-z/A-Z/;

           if ($seq=~m/([^ACDEFGHIKLMNPQRSTVWYX])/) {
               print "Wrong format in evaluation file (unknown amino acid): $1 at line:\n$first\n";
               exit;
           }
           print EV "$seq\t$val\n" unless length($seq)<$bestl;
       }
       else {
           print "Wrong format in evaluation file at line:\n$first\n";
           exit;
       }
   }
   ## two different formats accepted: FASTA file (create peptides from file), or PEPTIDE LIST
   if ($fasta==1)
   {
       print "# FASTA file!\n";
       my $seq="";
       my $nmer=$bestl+2*$fl;  ##include also possible flanks
       my $allX = 'X' x $nmer;

       ## Cut one protein into overlapping $nmer-long peptides. The same rules are
       ## applied to every entry, including the last one: peptides made only of X
       ## and peptides with non-standard characters are silently skipped.
       my $store_fragments = sub {
           my ($protein,$acc) = @_;
           for (my $i=0; $i<=length($protein)-$nmer; $i++)
           {
               my $pep = substr($protein,$i,$nmer);
               next if $pep eq $allX;
               next if ($pep=~m/([^ACDEFGHIKLMNPQRSTVWYX])/);
               ##only print once duplicate peptides (but keep track of multiple entries in the hash of arrays)
               print EV "$pep\t0\n" unless exists($fragments{$pep});
               push(@{$fragments{$pep}}, $acc);
           }
       };

       while (defined(my $l=<IN>))
       {
           my @words = split ' ',$l;
           next unless defined($words[0]);
           $l = $words[0];

           if ($l=~m/>(.*)/)
           {
               my $next_id=$1;
               $store_fragments->($seq,$fsac);   ## flush the entry that just ended
               $seq = "";
               $fsac=$next_id;
               $fsac="nA!" if length($fsac)==0;
               $fastaentries++;
           }
           else
           {
               $l =~ tr/a-z/A-Z/;
               $seq .= $l;
           }
       }
       ###last sequence
       $store_fragments->($seq,$fsac);
   }
   elsif ($fasta==0)  ##peptide list format, without associated values
   {
       print "# Peptide format!\n";
       while (defined(my $l=<IN>))
       {
           next if length($l)<$bestl; ##peptide too short, or empty line
           if ($l=~/^(\S+)/)
           {
               my $seq=$1;
               $seq =~ tr/a-z/A-Z/;
               if ($seq=~m/([^ACDEFGHIKLMNPQRSTVWYX])/) {
                   print "Wrong format in evaluation file for entry >$fsac (unknown amino acid): $1\n";
                   exit;
               }
               print EV "$seq\t0\n";
           }
           else
           {
               print "Wrong format in evaluation file, line:\n$l\n";
               exit;
           }
       }
   }
   else  ##peptide list format WITH associated values
   {
       print "# Peptide format with associated values!\n";
       while (defined(my $l=<IN>))
       {
           next if length($l)<$bestl; ##peptide too short, or empty line
           if ($l=~/^(\S+)\s+(\S+)/)
           {
               my $seq=$1;
               my $val=$2;
               $seq =~ tr/a-z/A-Z/;
               if ($seq=~m/([^ACDEFGHIKLMNPQRSTVWYX])/) {
                   print "Wrong format in evaluation file (unknown amino acid): $1\n";
                   exit;
               }
               unless (isAnumber($val)) {
                   print "Wrong format in evaluation file (not a number): $val\nat line:\n$l\n";
                   exit;
               }
               print EV "$seq\t$val\n";
           }
           else
           {
               print "Wrong format in evaluation file, line:\n$l\n";
               exit;
           }
       }
   }

   ## the evaluation set must be completely written before any external
   ## command reads it
   close IN;
   close EV;

   if ($fasta==-1)
   {
       ## measured values go through the same rescaling chosen for the training
       ## data; the result is expected in a file named rr_<name> beside the input
       my $evtmp = $evalsetfile;
       my ($dir,$fn) = dir_and_fname($evtmp);
       $evalsetfile =  $dir . "/rr_" . $fn;

       if ($dolog==2) {  #no rescaling: plain copy
           $cmd = "cp $evtmp $evalsetfile";
       } else {
           $cmd = "$perl -I $bin/lib $logtransf -f $evtmp -L 0 -H 1 -s3";
           $cmd .= " -l" if $dolog==0;  ##just linear rescale
           $cmd .= " -r" if $reverse_data==1;  #low values are positive examples
       }
       system($cmd);
   }

   ## split evaluation file into 10 subfiles

   my $sets=split_file($evalsetfile,10,"$wd/eval",0);

   print "# Running networks on evaluation file now...\n";

   ##and run the networks in parallel
   for (my $i=0; $i<$sets; $i++)
   {
       $cmd = "$smm_play -fl $fl -l $bestl -bls 50 -mpat $matdir/BLOSUM%i -blf $blf";
       $cmd .= " -elpfr" if defined($encF);
       $cmd .= " -eplen" if defined($encL);
       $cmd .=" -offset -s $wd/$modfile{$bestl} $wd/eval.split.${i}.txt > $wd/eval.split.${i}.pred";
       $cm->start($cmd);
   }
   $cm->wait_all_children;

   my $evalAll = "$wd/eval.all.txt";
   open (ALL,'>',$evalAll) or die "Cannot create file $evalAll: $!";

   print "# Sorting results...\n";

   ## sort best scoring peptides in the evaluation set: each piece is sorted on
   ## its own, the pieces are then merged (sort -m) or simply concatenated
   $mg= defined($opt_S) ? "sort -mnrk3" : "cat";
   for (my $i=0; $i<$sets; $i++)
   {
       $mg .= " $wd/ev.${i}.sorted.txt";
       my $ff = "$wd/eval.split.${i}.pred";
       my @lns;
       open (F,'<',$ff) or die "Cannot open file $ff: $!\n";
       while (defined(my $l=<F>)) {
           next if $l=~'#';
           ## fields: Core Start Target Pred Peptide
           if ($l=~/(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)/)
           {
               my ($pcore,$ptarget,$ppred,$ppep) = ($1,$3,$4,$5);
               if ($ppred>$Xthr)    ##prediction bigger than threshold
               {
                   if ($fasta==1)
                   {
                       die "ERROR at line $l" unless defined($fragments{$ppep});
                       my @fid=@{$fragments{$ppep}};
                       my $fas=join("/",@fid);   ## all entries containing the peptide

                       ##avoid repeating in the output same cores but with shifted flanks (just take the highest, already sorted)
                       unless(exists($core{"$pcore-$fid[0]"}))
                       {
                           push (@lns, "$pcore $ppep $ppred $fas");
                           $core{"$pcore-$fid[0]"}=1;
                       }
                   }
                   else {
                       push (@lns, "$ppep $ptarget $ppred $pcore");
                   }
               }
               ## every prediction (also below threshold) counts for the statistics
               print ALL "$ppep\t$ptarget\t$ppred\t$pcore\n";
               $evpoints++;
               if ($fasta==-1) {
                   push(@evtarget,$ptarget);
                   push(@evpred,$ppred);
               }
           }
       }
       close F;
       @sorted=();
       if (defined($opt_S)) {
           @sorted = sort {   ##sort by predicted value
               (split ' ', $b)[2] <=>
               (split ' ', $a)[2]
           } @lns;
       }
       else {
           @sorted = @lns;
       }
       open (ST,'>',"$wd/ev.${i}.sorted.txt") or die "Cannot create file $wd/ev.${i}.sorted.txt: $!\n";

       for (0..$#sorted) {
           my $ltab = $sorted[$_];
           $ltab =~ s/ /\t/g; #tab-separated
           print ST "$ltab\n";
       }
       close ST;
   }
   close ALL;

   $mg .= " > $wd/evalset.sorted.txt";
   system($mg);

   ## performance is reported only for measured data with at least 10 points
   if ($fasta==-1 && $evpoints>=10)
   {
       my $rmse=0;
       # RMSE on evaluationset
       for (my $i=0; $i<=$#evtarget; $i++)
       {
           $rmse += ($evtarget[$i]-$evpred[$i])**2;
       }
       my $sq = $rmse/$evpoints;
       $rmse = sqrt($sq);

       ## get Pearson correlation

       my $ep=`cat $evalAll | $args 2,3 | $xycorr -bt -- | $args 12`;

       if ($use_R) {
           my ($ep2,$esp) = scatterplot($evalAll,$evpoints,$bestl,$evscat_pdf);
           $evspear=sprintf("%.4f",$esp);
       }
       $evpears=sprintf("%.4f",$ep);
       $evRMSE=sprintf("%.6f",$rmse);
   }

   ## copy results

   my $header= $fasta==1 ? "Core\tContext\tScore\tSequence" :  "Peptide\tObserved\tScore\tCore";

   $cmd = "echo \"$header\" > $resdir/${prefix}.evalset.txt; ";
   $cmd .= "cat $wd/evalset.sorted.txt >> $resdir/${prefix}.evalset.txt;";
   $cmd.= " cp $evscat_pdf $resdir" if ($fasta==-1 && $evpoints>=10 && $use_R);
   system($cmd);

   print "### Uploaded $evpoints peptides";
   print " from $fastaentries FASTA entries" if $fasta==1;

   print "\n# Predictions saved in $resdir/${prefix}.evalset.txt\n";

   if ($fasta==-1 && $evpoints>=10)
   {
       print "## RMSE = $evRMSE\n";
       print "## Pearson correlation = $evpears\n";
       print "## Spearman correlation = $evspear\n" if ($use_R);
   }
}

#### DELETE ALL from working directory
## The working directory $wd is removed recursively once the results have been
## copied; option -X keeps it (the usage text describes -X as a debugging aid).
$cmd = "rm -r $wd";
system($cmd) unless defined($opt_X);

#################################################
### SUBROUTINES

sub usage{
  ## Print the command-line help (one line per option) and terminate the program.
  my @help = (
    "$0",
    ".",
    "",
    "Usage: $0 -f file [-r resdir] [-c proc] [-m length] [-j flanks] [more options]",
    "Command line options:",
    "  f: upload training set (peptide TAB signal)",
    "  Y: positives have high values (0) or low values (1) (def:0)",
    "  P: prefix for results files",
    "  M: upload a trained model",
    "  l: linear[0] log-transform[1] no rescaling[2] (def:0)",
    "  a: number of folds for cross-validation (def:5)",
    "  H: method to create subsets: random [0], homology [1], common motif [2]",
    "  t: threshold for homology clustering (def:0.8)",
    "  I: max overlap length for common motif (def:5)",
    "  E: remove homologous sequences from dataset (switch)",
    "  A: preserve repeated flanks in original data (switch)",
    "  e: do exaustive cross-validation [1] (default [0] fast evaluation)",
    "  y: stop training on best test set performance (switch)",
    "  d: directory with executables (./bin)",
    "  r: directory where results are saved",
    "  R: path for R (graphics)",
    "  W: path for Weblogo (sequence logos)",
    "  m: range motif length in format [length] or [min]-[max]/[step] (def:9)",
    "  C: number of training cycles (def: 500)",
    "  n: number of hidden neurons, can also be supplied as comma separated list (def:3)",
    "  j: PFR to be used in neural network training (def:0)",
    "  L: encode peptide length (switch)",
    "  F: encode PFR length (switch)",
    "  B: encoding for NN: Sparse[0] Blosum[1] Both[2](def: 1)",
    "  c: number of parallel processes for NN training (def: 10)",
    "  s: number of seeds for each network architecture (def: 10)",
    "  b: number of networks for the final network ensemble (def: 20)",
    "  S: sort results by prediction value (switch)",
    "  Q: make logos of all networks in the final ensemble (switch)",
    "  U: number of bits for sequence logo (def:4)",
    "  O: DO NOT re-align networks with offset (switch)",
    "  u: start temperature for PSSM align (def:0.10)",
    "  Z: number of iterations per temperature step (def:2000)",
    "  p: preference for hydrophobic AAs at P1 (switch)",
    "  x: peptide file for external evaluation",
    "  T: threshold for predictions on evaluation set (def: 0)",
    "  X: do NOT delete temp files, for debugging (switch)",
    "  h: print this message",
  );
  foreach my $help_line (@help) {
    print("$help_line\n");
  }
  exit;
}

## check format of a number
## Returns 1 when the argument is a string Perl can use as a number without a
## "isn't numeric" warning (integers, decimals, exponent notation, optional
## surrounding whitespace), 0 otherwise (including undef and the empty string).
## The check is delegated to the core Scalar::Util module, so the result no
## longer depends on warnings being globally enabled (-w) at run time.
sub isAnumber {
    my $test = shift;

    return 0 unless defined($test);
    require Scalar::Util;   ## core module, loaded on first use
    return Scalar::Util::looks_like_number($test) ? 1 : 0;
}

## check for a strictly positive integer
## Returns 1 when the argument consists of digits only and is greater than
## zero, 0 otherwise.
sub isInt {
    my ($candidate) = @_;

    return 0 unless $candidate =~ m/^\d+$/;
    return $candidate>0 ? 1 : 0;
}

## break path into directory and file name
## Returns the list (directory, file name). A path without any directory part
## is reported as living in "."; an empty path is a fatal error.
sub dir_and_fname {

   my ($path) = @_;
   my ($dir,$fn);

   if ($path =~ m/(.+)\/(.+)$/) {
       ## everything up to the last slash is the directory
       ($dir,$fn) = ($1,$2);
   }
   elsif ($path =~ m/(.+)/) {
       ($dir,$fn) = (".",$1);
   }
   else {
       die ("Invalid path\n");
   }
   return ($dir,$fn);
}

## Comparison routine for hash keys: compares the values stored under the keys
## $a and $b in a hash built from the argument list.
## NOTE(review): the comparison is ascending ($a <=> $b), although this routine
## was originally labelled "descending" -- confirm which order callers expect.
## NOTE(review): %hash is filled from @_; when used as a sort comparator the
## hash is presumably not passed in, leaving %hash empty -- verify the callers.
sub sortbyvalue {
   my (%hash) = @_;
   $hash{$a} <=> $hash{$b};
}

######################################
## make correlation plot, and calculate pearson and spearman correlations
## Arguments: results file, number of data points, motif length, output PDF.
## Runs the scatterplot R script and returns the list (pearson, spearman) read
## back from the R log; a value stays 0 if it is not found in the log.

sub scatterplot {

    my ($rfile,$bpt,$lgt,$scPDF) = @_;

    ##number of xbins: 0 for small data sets, otherwise
    ##.. a number between 50 and 150 (uses a double sigmoid)
    my $xbins = $bpt>500
        ? 50*(1/(1+exp(-0.001*($bpt-2000)))+1/(1+exp(-0.001*($bpt-8000))))+50
        : 0;

    $cmd = "cat $rscripts/scatterplot.tgvspred.R | $R --vanilla --args $rfile $scPDF \"Corr. scatterplot - motif length=$lgt ($bpt datapoints)\" $xbins > $wd/Rlog.txt";
    system($cmd);

    ## fetch Pearson and Spearman correlation from Rlog.txt file

    open (my $rlog,'<',"$wd/Rlog.txt") or die "Cannot find $wd/Rlog.txt file\n";
    my ($pears,$spear)=(0,0);
    my $seen_cortest=0;                  ## nothing is parsed before the first cor.test line
    my ($want_pears,$want_spear)=(0,0);  ## set when the next line holds the value

    while (defined(my $line=<$rlog>))
    {
        chomp $line;
        if ($want_pears==1) {
            $pears=$line;
            $want_pears=0;
            next;
        }
        if ($want_spear==1) {
            $spear=$line;
            $want_spear=0;
            next;
        }
        if ($seen_cortest==0) {
            $seen_cortest=1 if $line=~ m/cor\.test/;
            next;
        }
        ## a label line announces the value printed on the following line
        $want_pears=1 if $line=~ m/\s+cor\s+/;
        $want_spear=1 if ($line!~ m/alternative/ and $line=~m/rho\s+/);
    }
    close $rlog;
    return ($pears,$spear);
}

############################################################
## split file into N ones with same number of lines
## Arguments:
##   $_[0] input file; of every line only the first field, or the first two
##         whitespace-separated fields (re-joined with a TAB), are kept
##   $_[1] number of files wanted (values below 1 are treated as 1)
##   $_[2] prefix of the output files
##   $_[3] "1" to force the split: files are then named <prefix><n>; otherwise
##         they are named <prefix>.split.<n>.txt and inputs shorter than 1000
##         lines are written to a single file
##   $_[4] optional: truncate the first field to this many characters
## Returns the number of output files. Files 0 .. N-1 always exist after the
## call; some are empty when the input has too few lines to fill them.

sub split_file {

    my ($infile,$splits,$pfix) = @_;
    my $force = (defined($_[3]) and $_[3] eq "1") ? 1 : 0;
    my $subl  = defined($_[4]) ? $_[4] : 0;
    $splits=1 if $splits<1;   ## avoids a division by zero below

    ##read IN and count lines. make substrings if specified
    my @lines;
    open (my $in,'<',$infile) or die "Cannot open $infile\n";
    while (defined(my $l=<$in>))
    {
        chomp $l;
        my ($ss,$val) = ($l,"");
        ($ss,$val) = ($1,$2) if $l=~m/^(\S+)\s+(\S+)/;
        $ss = substr($ss,0,$subl) if $subl>0;
        $ss .= "\t$val" if length($val)>0; ##re-append value
        push (@lines,$ss);
    }
    close $in;
    my $counter=scalar(@lines);

    $splits=1 if ($counter<1000 and $force==0);  ##just make one file

    ## calculate split points: file g receives the lines below $thr[g]; the
    ## last threshold lies beyond the end so the last file takes the remainder
    my $div=int((($counter)/$splits)+0.5);
    my @thr = map { $_*$div } 1..$splits-1;
    push(@thr,$counter+2);

    my $name_of = sub {
        my $n=shift;
        return $force==1 ? "${pfix}$n" : "${pfix}.split.${n}.txt";
    };

    ## create files
    my $g=0;
    my $fname=$name_of->($g);
    open (my $out,'>',$fname) or die "Cannot create file $fname: $!\n";

    for my $i (0..$#lines)
    {
        if ($i>=$thr[$g])
        {
            $g++;
            close $out;
            $fname=$name_of->($g);
            open ($out,'>',$fname) or die "Cannot create file $fname: $!\n";
        }
        print {$out} "$lines[$i]\n";
    }
    close $out;

    ## callers loop over 0..N-1: make sure every announced file exists
    for my $rest ($g+1..$splits-1)
    {
        $fname=$name_of->($rest);
        open ($out,'>',$fname) or die "Cannot create file $fname: $!\n";
        close $out;
    }
    return $splits;
}
