# ***********************************************************************
# If you experience any problem with this pgm,
# please do not hesitate to contact either
# Lawrence Joseph 934-1934 ext 44713, lawrence.joseph@mcgill.ca
# or
# Patrick Blisle 934-1934 ext 44714, belisle@epimgh.mcgill.ca
#
# http://www.medicine.mcgill.ca/epidemiology/Joseph
#
# ***********************************************************************

$OUTPUT_WIDTH = 80;
$NLPP         = 57; # Number of lines per page (relevant when output has extension .doc)

$MATRIXCOLSEP  = 2;
$MATRIXLMARGIN = 1;

$SAMECLUSTERCHAR = "#";

# --- Do not edit below this line ----------------------------------------------

use Win32;
$CWD = Win32::GetCwd();

$SRC     = ".";
$TMP     = "../tmp";
$FATAL   = "$TMP/fatal.txt";
$SUCCESS = "$TMP/PerlSuccess.txt";

$PROP_CI = "$SRC/prop-ci.txt";

$VB_OUT  = "$TMP/sensspec-parms.txt";

$CLUSTER_HEADER = "cluster";
$SUBJECT_HEADER = "subject";

$CELLBYCELL = "cell-by-cell";

$NMAX4EXACTCI = 500;
@Z = (80, 1.281552, 90, 1.644854, 95, 1.959964, 99, 2.575829); # normal distribution quantiles
# IMP: order in levels listed above must me the same as in table $PROP_CI

$ERRORW = 50; # Error message width (to be displayed in Visual Basic interface)

# **************************************************************************************
# Arbitrary but different fatal error codes 
#
# (WARNING: if changed, Visual Basic code must also change. [Thus: don't change!!])

$FATAL_CHDIR           =  1;
$FATAL_REPEATEDPATIENT =  2;
$FATAL_NONSPOLIGO      =  3;
$FATAL_UNEVENSTRAINS   =  4;
$FATAL_VB              =  5;
$FATAL_VBREAD          =  6;
$FATAL_INVALIDFILENAME =  7;
$FATAL_FILENOTREADABLE =  8;
$FATAL_CANNOTWRITE     =  9;
$FATAL_STATTABLE       = 10;
$FATAL_MKTMP           = 11;
$FATAL_NODATA          = 12;
$FATAL_NOHEADER        = 13;


# Integer values corresponding to the order in which they appear in VB list
# -------------------------------------------------------------------------
# (for DistancesListVersion)

$LONG  = 0;
$SHORT = 1;
$NONE  = 2;


# --- Start pgm --------------------------------------------------------------------------------------------------


if (__FILE__ =~ /(.*)\\/)
{
  $HOME =  $1; # directory where this pgm sits
  chdir($HOME) || &FatalError($FATAL_CHDIR, $HOME);
}
else
{
  $HOME = $CWD;
}

unlink($FATAL, $SUCCESS);


# ----------------------------------------------------------------
#
# Read Visual Basic Interface Output to get Problem description
#
# (names of input/output files, genetic distance, etc)


open(TMP, $VB_OUT) || &FatalError($FATAL_VBREAD);
@vb_out = (<TMP>);
close TMP;
unlink($VB_OUT);
chomp @vb_out;
  
($inputfile, $outputfile, $ndiff, $cilevel, $DistancesListVersion, $PrintDistancesMatrices, $method) = @vb_out;
  
$method                 =~ s/ //g; # method to be used for strains differences count
$ndiff                  =~ s/ //g;
$cilevel                =~ s/\%//; # remove the % sign
$DistancesListVersion   =~ s/ //g;
$PrintDistancesMatrices =~ s/ //g;
  
# --- Input filename
  
&FatalError($FATAL_INVALIDFILENAME, $inputfile) unless -f $inputfile;
&FatalError($FATAL_FILENOTREADABLE, $inputfile) unless -r _;
  
&IsDataClustered($inputfile); # will define $IS6110CLUSTER & $SPOLIGO, and fail if there is either no data to analyze or no file header (a line that 'looks like' column titles)
  
# -- Output filename: derive name for short-version output
  
($tmp) = &FilePath($outputfile);
  
$outputfile .= ".doc" unless $tmp =~ /(.*)\./;


# ---- Build clusters -------------------------------------
  
$SPEC = $IS6110CLUSTER ? 0 : 1;
  
@out = &ReadDataAndBuildClusters($inputfile, $method, $ndiff);
  
# Line above will return, amongst other things:
# $Samplesize @ClustersList %ClusterSize %NewClusterIds $NSize1

  
# Compute some desc stats about new clusters' sizes
  
$NSize1 = 0 unless $NSize1;
foreach $cluster (@ClustersList)
{
  push(@clusterssizes2plus, $ClusterSize{$cluster});
}

  
# --- Compute confidence interval for either sensitivity or specificity
  

$nclustered = $Samplesize - $NSize1;
  
$numerator = $SPEC ? ($Samplesize - $nclustered) : $nclustered;
  

# Read normal distrn quantiles for pre-determined levels
# whichZ will be used to decide which columns of tables are to be read.
  
while (@Z && !$Z)
{
  ($p, $z, @Z) = @Z;
  $Z = $z if $p == $cilevel;
  $whichZ++;
}


if ($Samplesize > $NMAX4EXACTCI) 
{
  # compute CI approximation

  $p = $numerator / $Samplesize;
  $margin = $Z * sqrt($p*(1-$p)/$Samplesize);
  ($cilower, $ciupper) = ($p - $margin, $p + $margin);
}
else
{
  ($m, $ci2reverse) = ($numerator, 0);
  ($m, $ci2reverse) = ($Samplesize-$m, 1) if $m > ($Samplesize-$m);
  $table = $PROP_CI;
  
  $nlines2ignore = ($Samplesize%2 == 0) ? ($Samplesize*$Samplesize + 2*$Samplesize - 4) / 4 : ($Samplesize + 3) * ($Samplesize - 1) / 4;
  
  open(TABLE, $table) || &FatalError($FATAL_STATTABLE, $table);

  while ($tmp = <TABLE>)
  {
    next if $. <= $nlines2ignore;
    chomp;
    @tmp = split(" ", $tmp);
    if ($tmp[0] == $m)
    {
      ($cilower, $ciupper) = splice(@tmp, 2*$whichZ, 2);
      foreach ($cilower, $ciupper)
      {
        $_ = "0.$_" unless $_ eq "0";
      }
      last;
    }
  }
  close TABLE;
  
  ($cilower, $ciupper) = (1-$ciupper, 1-$cilower) if $ci2reverse;
}
  
  
# Prepare outputs headers -------------------------

  
open(OUTPUT, "> $outputfile") || &FatalError($FATAL_CANNOTWRITE, $outputfile);
  
($time, $date) = &TrueDate();

push(@outheader, &RAligned($date, $OUTPUT_WIDTH), &RAligned($time, $OUTPUT_WIDTH), "");
push(@outheader, "File: $inputfile");
  
$txt = "Number of differences between spoligotypes: ";
$txt .= (!$method || $method eq $CELLBYCELL) ? "each missing spacer" : "contiguous missing spacers";
$txt .= " = 1 deletion";
push(@outheader, $txt);
  
$tmp = ">> Identical";
if ($ndiff > 0)
{
  $tmp .= " + $ndiff difference";
  $tmp .= "s" if $ndiff > 1;
}
push(@outheader, "$tmp <<", "");

  
# --- Output summaries -----------------------------------------------
  
  
$point_estimate = sprintf("%.4f", $numerator/$Samplesize);
$notclustered = $Samplesize - $nclustered;
push(@outheader, "Number of subjects: $Samplesize");
  
push(@outheader, "# of IS6110 clusters: $NIS6110CLUSTERS") if $NIS6110CLUSTERS;
  
  
push(@outheader, "# of subjects clustered: $nclustered / not clustered: $notclustered", "");
  
  
$parm = $SPEC ? "specificity" : "sensitivity";
$cilower = sprintf("%.4f", $cilower);
$ciupper = sprintf("%.4f", $ciupper);
  
push(@outheader, "$parm: $point_estimate   $cilevel%-c.i.: $cilower-$ciupper", "");
  
  
# ---- Cluster sizes -------
  
($n, $mean, $std, $min, $max, $p25, $p50, $p75, $mode) = &DescriptiveStats(@clusterssizes2plus); # ok pato
  
$txt = "New clusters' sizes -- descriptive statistics (for clusters with size > 1)";
push(@outheader, $txt);
push(@outheader, "-" x length($txt), "");
  
$mean = sprintf("%.2f", $mean);
$std  = sprintf("%.2f", $std);
$p25 = sprintf("%.2f", $p25);
$p50 = sprintf("%.2f", $p50);
$p75 = sprintf("%.2f", $p75);
  
push(@outheader, &LAligned("N:", 10) . " $n");
push(@outheader, &LAligned("Avg (sd):", 10) . " $mean ($std)");
push(@outheader, &LAligned("Range:", 10) . " $min-$max");
push(@outheader, &LAligned("Median:", 10) . " $p50");
push(@outheader, &LAligned("Mode:", 10) . " $mode");
push(@outheader, &LAligned("IQR:", 10) . " $p25-$p75");
push(@outheader, "");
push(@outheader, "(# of clusters of size 1: $NSize1)");
push(@outheader, "");

push(@outheader, &AvgDiffs());

# Write outputs headers -------------------------
  
push(@outheader, "");
foreach (@outheader)
{
  print OUTPUT "$_\n";
}
  
  
# -- Subjects description


@tmp = &ArrangedDistancesList(@out);
foreach (@tmp)
{
  print OUTPUT "$_\n";
}


# --- Write distances matrix


if ($PrintDistancesMatrices)
{
  @tmp = sort {$a <=> $b} keys %Cluster0Ids;
  foreach $cluster0 (@tmp)
  {
    @ids = split(" ", $Cluster0Ids{$cluster0});
    @dmatrix = &DistancesMatrix($cluster0, @ids);

    foreach (@dmatrix)
    {
      print OUTPUT "$_\n";
    }
  }
}

close OUTPUT;

# Write a (empty) file to indicate VB that this Perl pgm was successful

open(SUCCESS, ">$SUCCESS");
close SUCCESS;

# --- End of code -----------------------------------------------------------------------------------------
# --- Library of functions --------------------------------------------------------------------------------


sub Add2Out()
{
  my ($m, $header, $sep, $width, @values) = @_;
  my (@tmp, $txt, $tmp);

  @tmp = splice(@values, 0, $m);
  $txt = $header;

  foreach $tmp (@tmp)
  {
    $txt .= $sep . &RAligned($tmp, $width)
  }

  ($txt, @values);
}


sub ArrangedDistancesList()
{
  my (@out) = @_;
  my ($is6110cluster, $lis6110cluster, $mycluster, $s1, $s2, $ls1, $diff,
      $clength0, $clength, $slength0, $slength, $ldiff,
      $Identical, $spaces4indentical, $header,
      $lstrain, $width2identical, $tmp, $dummy, $marginb4identical, $out, $currentline, $next,
      %printedstrain,
      @tmp, @header, @res);

  $marginb4identical = "  ";

  ($clength, $slength) = &CSLength(@out);
  $slength0 = length($SUBJECT_HEADER);
  $slength0 = $slength if $slength > $slength0;
  $clength0 = length($CLUSTER_HEADER);
  $clength0 = $clength if $clength > $clength0;


  # Rearrange strains in a printable fashion

  foreach $tmp (@out)
  {
    ($dummy, $s1) = split(" ", $tmp);
    next if $printedstrain{$s1};

    @tmp = split(" ", $strain{$s1});
    foreach $tmp (@tmp)
    {
      $tmp = "($tmp)" if length($tmp) > 1;
    }
    $printedstrain{$s1} = join("", @tmp);
    $printedstrain{$s1} =~ tr/01/_*/ if $SPOLIGO;
  }


  $header = &RAligned($CLUSTER_HEADER, $clength0) . " " . &RAligned($SUBJECT_HEADER, $slength0);
  $width2identical = length($header) + length($marginb4identical);
  @header = ($header, "-" x $clength0 . " " . "-" x $slength0, "");

  $spaces4indentical = $OUTPUT_WIDTH - $width2identical;

  $ls1 = -1;

  # Write down report


  while (@out)
  {
    $out = shift @out;

    ($mycluster, $s1, $s2, $diff) = split(" ", $out);
    $is6110cluster = $` if $mycluster =~ /-/;

    if (length($is6110cluster) && $is6110cluster ne $lis6110cluster)
    {
      if ($currentline)
      {
        push(@res, $currentline);
        $currentline = "";
      }

      push(@res, "") if $lis6110cluster >= 0;
      push(@res, "=" x $OUTPUT_WIDTH);
      push(@res, "IS6110 Cluster # $is6110cluster");
      push(@res, "=" x $OUTPUT_WIDTH);
      push(@res, @header);
    }
    elsif (length($is6110cluster) == 0 && @header)
    {
      push(@res, @header);
      @header = ();
    }

    if ($NumericIdno{$s1} != $NumericIdno{$ls1})
    {
      push(@res, $currentline) if $currentline;
      push(@res, "") if $ls1 >= 0;

      $tmp  = &RAligned($mycluster, $clength0);
      $tmp .= " " . &RAligned($s1, $slength0);
      $tmp .= $marginb4identical . "strain = $printedstrain{$s1}";
      push(@res, $tmp);
      $currentline = "";
      $ldiff = -1;
    }


    if ($diff == $ldiff)
    {
      $tmp = $currentline;
      $next = &RAligned($s2, $slength+1);
      $currentline .= $next;
      if (length($currentline) > $OUTPUT_WIDTH)
      {
        push(@res, $tmp);
        $currentline = " " x $width2identical . $next;
      }
    }
    else
    {
      push(@res, $currentline) if $currentline;

      $Identical = ($diff == 0) ? "Identical" : "Identical+$diff";
      $Identical = "-" x (($spaces4indentical-length($Identical))/2) . $Identical;
      $Identical .= "-" x ($spaces4indentical-length($Identical));

      push(@res, " " x $width2identical . $Identical);
      $currentline = &RAligned($s2, $width2identical + $slength + 1);
    }

    $ldiff = $diff;

    ($lis6110cluster, $ls1) = ($is6110cluster, $s1);
  }

  push(@res, $currentline) if $currentline;
  @res;
}


sub AvgDiffs()
{
  my ($txt, $width, $width0, $sep, $lsep,
      $n, $mean, $sd, $min, $max, $mode, $range, $iqr0, $iqr1, $p25, $p50, $p75, $size,
      $underline, $nline, $meanline, $sdline, $rangeline, $medianline, $modeline,
      $iqr0line, $iqr1line, $sizeline,
      $nline_h, $meanline_h, $sdline_h, $rangeline_h, $medianline_h, $modeline_h,
      $iqr0line_h, $iqr1line_h,$sizeline_h,
      $kluster, $output_nparts, $k, $mclusters, $clustersnumbers_h, $tmp, $out,
      $id1, $id2, $index,
      %n, %mean, %sd, %range, %median, %mode, %iqr0, %iqr1,
      @clusters, @out, @tmp, @res, @diff, @ids);

  $width  = 5;
  $width0 = 15;
  $lsep   = 2;

  $sep = " " x $lsep;

  $txt = "Average # of differences between subjects (within new clusters)";
  push(@res, $txt);
  push(@res, "=" x length($txt), "");

  $clustersnumbers_h = &LAligned("New cluster #", $width0);

  $nline_h      = &LAligned("N:", $width0);
  $meanline_h   = &LAligned("mean:", $width0);
  $sdline_h     = &LAligned("sd:", $width0);
  $rangeline_h  = &LAligned("range:", $width0);
  $medianline_h = &LAligned("median:", $width0);
  $modeline_h   = &LAligned("mode:", $width0);
  $iqr0line_h   = &LAligned("IQR:", $width0);
  $iqr1line_h   = " " x $width0;
  $sizeline_h   = &LAligned("size:", $width0);


  @clusters = @ClustersList;

  foreach $kluster (@clusters)
  {
    @diff = ();
    @ids = split(" ", $NewClusterIds{$kluster});
   
    while (@ids)
    {
      $id1 = shift @ids;

      foreach $id2 (@ids)
      {
        $index = "$id1:$id2";
        push(@diff, $diff{$index});
      }
    }


    ($n, $mean, $sd, $min, $max, $p25, $p50, $p75, $mode) = &DescriptiveStats(@diff);

    $n{$kluster} = $n;

    $mean = sprintf("%.2f", $mean);
    $mean{$kluster} = $mean;

    $sd = sprintf("%.2f", $sd);
    $sd{$kluster} = $sd;

    $range = "$min-$max";
    $range{$kluster} = $range;

    $p50 = sprintf("%.2f", $p50);
    $median{$kluster} = $p50;

    $mode{$kluster} = $mode;

    $iqr0{$kluster} = "$p25-";
    $iqr1{$kluster} = $p75;
  }


  if (@clusters)
  {
    $mclusters = ($OUTPUT_WIDTH - $width0);
    $mclusters = ($mclusters - $mclusters%($width+$lsep)) / ($width+$lsep);

    while (@clusters)
    {
      push(@out, "", "") if @out;
      $output_nparts++;

      $txt = $clustersnumbers_h;
      $underline = " " x $width0;
      @tmp = splice(@clusters, 0, $mclusters);

      $nline = $nline_h;
      $meanline = $meanline_h;
      $sdline = $sdline_h;
      $rangeline = $rangeline_h;
      $medianline = $medianline_h;
      $modeline = $modeline_h;
      $iqr0line = $iqr0line_h;
      $iqr1line = $iqr1line_h;
      $sizeline = $sizeline_h;

      foreach $tmp (@tmp)
      {
        $txt        .= $sep . &RAligned($tmp, $width);
        $underline  .= $sep . "-" x $width;
        $nline      .= $sep . &RAligned($n{$tmp}, $width);
        $meanline   .= $sep . &RAligned($mean{$tmp}, $width);
        $sdline     .= $sep . &RAligned($sd{$tmp}, $width);
        $rangeline  .= $sep . &RAligned($range{$tmp}, $width);
        $medianline .= $sep . &RAligned($median{$tmp}, $width);
        $modeline   .= $sep . &RAligned($mode{$tmp}, $width);
        $iqr0line   .= $sep . &RAligned($iqr0{$tmp}, $width);
        $iqr1line   .= $sep . &RAligned($iqr1{$tmp}, $width);
        $size        = (sqrt(8*$n{$tmp}+1)+1)/2;
        $sizeline   .= $sep . &RAligned($size, $width);
      }
      push(@out, $txt, $underline, "", $nline, $meanline, $sdline, $rangeline, $medianline, $modeline, $iqr0line, $iqr1line, $sizeline);
    }


    while (@out)
    {
      $tmp = shift @out;
      push(@res, $tmp);

      if ($output_nparts > 1 && $tmp =~ /$clustersnumbers_h/o)
      {
        $k++;
        $out = shift @out;

        $tmp = "[part $k of $output_nparts]";
        substr($out, 0, length($tmp)) = $tmp;
        push(@res, $out);
      }
    }
  }
  else
  {
    push(@res, "(No new cluster of size > 1.)");
  }

  push(@res, "", "");

  @res;
}


sub CSLength()
{
  my ($clength, $slength, $l, @tmp);

  foreach (@_)
  {
    @tmp = split;
    $l = length($tmp[2]);
    $slength = $l if $l > $slength;
    $l = length($tmp[1]);
    $clength = $l if $l > $clength;
  }

  ($clength, $slength);
}


sub DescriptiveStats()
{
  my (@x) = @_;
  my ($x, $sum, $sum2, $mean, $median, $sd, $mode, $n, $min, $max, $q1, $q3, $tmp, $fmode,
      @modes,
      %freq);

  if (@x)
  {
    $n = $#x + 1;

    @x = sort {$a <=> $b} @x;
    ($min, $max) = ($x[0], $x[$#x]);

    if ($n%2)
    {
      $median = $x[($n-1)/2];
    }
    else
    {
      $tmp = $n / 2;
      $median = ($x[$tmp] + $x[$tmp-1]) / 2;
    }

    foreach $x (@x)
    {
      $sum += $x;
      $sum2 += $x * $x;
      $freq{$x}++;
    }

    $mean = $sum / $n;
    $sd = ($n > 1) ? sqrt(($sum2 - $sum*$sum/$n) / ($n - 1)) : ".";

    # --- compute Q1 and Q3 as in sas

    if ($n%4)
    {
      $tmp = sprintf("%d", $n/4);
      $q1 = $x[$tmp];
      $tmp = sprintf("%d", 3*$n/4);
      $q3 = $x[$tmp];
    }
    else
    {
      $tmp = $n/4;
      $q1 = ($x[$tmp] + $x[$tmp-1]) / 2;
      $tmp *= 3;
      $q3 = ($x[$tmp] + $x[$tmp-1]) / 2;
    }

    # --- compute mode (return . if all values have the same count)

    @x = sort {$freq{$b} <=> $freq{$a}} keys %freq;

    push(@modes, shift @x);
    $fmode = $freq{$modes[0]};

    if (@x && $fmode == $freq{$x[$#x]})
    {
      $mode = ".";
    }
    else
    {
      push(@modes, shift @x) while @x && $freq{$x[0]} == $fmode;
      @modes = sort {$a <=> $b} @modes;
      $mode = join(";", @modes);
    }

    ($n, $mean,  $sd, $min, $max, $q1, $median,  $q3, $mode);
  }
  else
  { 
    (0,   "NA", "NA", "NA", "NA", "NA",   "NA", "NA",  "NA");
  }
}


sub Diff()
{
  my ($method, $strain1, $strain2) = @_;
  my (@strains, $diff, $x1, $x2);

  $diff = 0;

  @strains = reverse split(" ", $strain2);
  unshift(@strains, split(" ", $strain1));

  if ($method eq $CELLBYCELL)
  {
    while (@strains)
    {
      ($x1, $x2) = (shift @strains, pop @strains);
      $diff++ unless $x1 eq $x2;
    }
  }
  else
  {
    # $method eq "sequence"

    while (@strains)
    {
      ($x1, $x2) = (shift @strains, pop @strains);

      # -- Point 1. In a sequence without differences --

      ($x1, $x2) = (shift @strains, pop @strains) while $x1 == $x2 && @strains;

      # -- Point 2. In a sequence with differences    --

      if ($x1 != $x2)
      {
        $diff++;
        last unless @strains;
        if ($x1)
        {
          @strains = reverse @strains;
        }
        while (@strains)
        {
          ($x1, $x2) = (shift @strains, pop @strains);
          if ($x1)
          {
            unshift(@strains, $x1);
            push(@strains, $x2);
            last;
          }
        }
      }
    }
  }

  $diff;
}


sub Dir()
{
  my ($path) = @_;
  my ($backslash);

  $backslash = rindex($path, "\\");
  ($backslash >= 0) ? substr($path, 0, $backslash) : "";
}


sub DistancesMatrix()
{
  my ($cluster0, @ids) = @_;
  my (@head, @tmp, @col_labels, @cols, @thispage, @dmatrix, @r, @c,
      $head, $c, $lmaxlabel, $headw, $lmaxdiff, $sep,
      $tmp, $tmp1, $tmp2, $index, $nrows, $ncols, $colw, $r, $lastr, $star, $thisline);

  @ids = sort {$NumericIdno{$a} <=> $NumericIdno{$b}} @ids;
  $lmaxlabel = length($ids[$#ids]);

  # compute maximum length of diff for this table

  @tmp = @ids;
  while (@tmp)
  {
    $tmp1 = shift @tmp;
    foreach $tmp2 (@tmp)
    {
      $index = "$tmp1:$tmp2";
      $tmp = $diff{$index};
      $tmp = length($tmp);
      $lmaxdiff = $tmp if $tmp > $lmaxdiff;
    }
  }

  $nrows = $NLPP - $lmaxlabel - 2;
  $colw = $MATRIXCOLSEP + $lmaxdiff;
  $ncols = $OUTPUT_WIDTH - $lmaxlabel - $MATRIXLMARGIN - 1;
  $ncols = ($ncols-$ncols%$colw)/$colw;
  $sep = " " x ($colw - 1);

  while (@ids)
  {
    @cols = reverse @ids;
    pop @cols while $NumericIdno{$cols[$#cols]} <= $NumericIdno{$ids[0]};
    @r = splice(@ids, 0, $nrows);
    pop @r unless @ids; # no line for last subject

    while (@cols)
    {
      @c = splice(@cols, 0, $ncols);

      # Construct page header

      @head = ();
      @col_labels = @c;
      while ()
      {
        @tmp = ();
        foreach $c (@col_labels)
        {
          if ($c)
          {
            push(@tmp, chop $c);
          }
          else
          {
            push(@tmp, " ");
          }
        }

        $head = join($sep, "", @tmp);
        last unless $head =~ /\S/;
        $head =~ s/\s+$//; # rtrim
        unshift(@head, $head);
      }

      $headw = length($head[$#head]);
      foreach $head (@head)
      {
        $head = " " x ($lmaxlabel+$MATRIXLMARGIN+1) . $head;
      }

      $head = ">>> ";
      $head .= $IS6110CLUSTER ? "Cluster # $cluster0: " : "";
      $head .= " ($r[0]-_LASTR_) x ($c[$#c]-$c[0]) <<<";
      unshift(@head, $head);

      push(@head, " " x ($lmaxlabel+$MATRIXLMARGIN) . "+" . "-" x $headw);

      # Build diff diagonal matrix

      @thispage = ();
      foreach $r (@r)
      {
        last if $NumericIdno{$r} >= $NumericIdno{$c[0]};
        $lastr = $r;
        $thisline = &RAligned($r, $lmaxlabel) . " " x $MATRIXLMARGIN . "|";
        foreach $c (@c)
        {
          last if $c eq $r;
          $index = "$r:$c";
          $tmp = $diff{$index};
          $star = ($Cluster{$c} eq $Cluster{$r}) ? $SAMECLUSTERCHAR : "";
          $thisline .= &RAligned($star . $tmp, $colw);
        }
        push(@thispage, $thisline);
      }

      $head[0] =~ s/_LASTR_/$lastr/;
      $head[0] = "\f" . &RAligned($head[0], $OUTPUT_WIDTH);
      foreach $tmp (@head, @thispage)
      {
        push(@dmatrix, $tmp);
      }
    }
  }

  @dmatrix;
}


sub FatalError()
{
  my ($code, $txt) = @_;
  my (@fatal);

  chomp $code;

  if ($code =~ /\D/)
  {
    @fatal = ($code);
  }
  elsif ($code == $FATAL_CHDIR)
  {
    @fatal = ("Could not change directory to", &MySplit($txt, $ERRORW, "\\"));
  }
  elsif ($code == $FATAL_REPEATEDPATIENT)
  {
    @fatal = ("Subject # $txt found twice in input file", &MySplit($inputfile, $ERRORW, "\\", 1), "", "Please review data before re-submitting.");
  }
  elsif ($code == $FATAL_NONSPOLIGO)
  {
    @fatal = &MySplit("Strain for subject # $txt contains characters other than g's and i's. Please review data before re-submitting.", $ERRORW, " ", 1);
  } 
  elsif ($code == $FATAL_UNEVENSTRAINS)
  {
    @fatal = &MySplit("Uneven strains; subject # $txt has strain of different lengths than preceding subjects in input file. Please review data before re-submitting.", $ERRORW, " ", 1);
  }
  elsif ($code == $FATAL_VB)
  {
    @fatal = &MySplit("Perl subprogram could not find Visual Basic interface's output at its expected location.", $ERRORW, " ", 1);
  }
  elsif ($code == $FATAL_VBREAD)
  {
    @fatal = &MySplit("Perl subprogram could not read Visual Basic interface's output.", $ERRORW, " ", 1);
  }
  elsif ($code == $FATAL_INVALIDFILENAME)
  {
    @fatal = &MySplit("$txt: not a valid file name.", $ERRORW, "\\", 1);
  }
  elsif ($code == $FATAL_FILENOTREADABLE)
  {
    @fatal = &MySplit("$txt: not a readable file.", $ERRORW, "\\", 1);
  }
  elsif ($code == $FATAL_CANNOTWRITE)
  {
    @fatal = ("Cannot write to output to file",  &MySplit($txt, $ERRORW, "\\", 1));
  }
  elsif ($code == $FATAL_STATTABLE)
  {
    @fatal = ("Cannot read statistical table (for confidence interval limits); file", &MySplit($txt, $ERRORW, "\\", 1), "not found.");
  }
  elsif ($code == $FATAL_MKTMP)
  {
    @fatal = ("Could not make temporary directory tmp.");
  }
  elsif ($code == $FATAL_NODATA)
  {
    @fatal = ("No data found in input file", &MySplit($txt, $ERRORW, "\\"));
  }
  elsif ($code == $FATAL_NOHEADER)
  {
    @fatal = ("Input file", &MySplit($txt, $ERRORW, "\\"), "", "does not have a header (column titles/variable names).");
  }
  
  open(FATAL, ">$FATAL");
  foreach (@fatal)
  {
    print FATAL "$_\n";
  }
  close FATAL;

  die "\n";
}


sub FilePath()
{
  my ($file) = @_;
  my ($path);

  $file =~ s{/}{\\}g;
  $file =~ s{\\+}{\\}g;
  ($file, $path) = ($', substr($&,0,-1)) if $file =~ /.*\\/;

  ($file, $path);
}


sub IsDataClustered()
{
  my ($filename) = @_;
  my (@tmp, $dat, $header, $headerlf, $noheader);

  # Read first two non-empty lines of file to define IS6110CLUSTER & SPOLIGO

  open(DAT, $filename) || &FatalError($FATAL_FILENOTREADABLE, $filename);

  while ($dat = <DAT>)
  {
    next unless $dat =~ /\S/;
    chomp $dat;
    $dat =~ s/\t/ /g;
    $dat =~ s/ +/ /g;
    $dat =~ s/\r//g;
    $dat =~ s/\s+$//;      # rtrim
    $dat =~ s/\s*(\S)/\1/; # ltrim
    $dat = lc $dat;
    last if $header;
    $header = $dat; # 1st non-empty data line
  }
  close DAT;

  &FatalError($FATAL_NODATA, $filename) unless $dat;

  @tmp = split(" ", $header);
  $IS6110CLUSTER = ($tmp[0] =~ /cluster/) ? 1 : 0;

  # Make sure that the 1st row was a header line, that is, that it consists not only of numbers 
  # (where last field is ignored if it consists of only g's and i's)

  $headerlf = pop @tmp;
  $header = join("", @tmp);

  unless ($header =~ /\D/)
  {
    if ($headerlf =~ /[gi]/)
    {
      $headerlf =~ s/[gi]//g;
      $noheader = 1 unless length($headerlf);
    }
    else
    {
      $noheader = 1 unless $headerlf =~ /\D/;
    }

    &FatalError($FATAL_NOHEADER, $filename) if $noheader;
  }

  # See whether data are spoligo (g's and i's) or not

  @tmp = split(" ", $dat);
  $SPOLIGO = ($tmp[$#tmp] =~ /[gi]/) ? 1 : 0;
}


sub IsInteger()
{
  my ($x) = @_;

  $x =~ s/[0-9]//g;
  length($x) == 0 ? 1 : 0;
}


sub LAligned()
{
  my ($txt, $width) = @_;

  $txt . " " x ($width-length($txt));
}


sub MySplit()
{
  my ($txt, $width, $split_char, $wider_forbidden) = @_;
  my (@res, $where);
  
  # Pour defaire une ligne de texte en plusieurs, en respectant une largeur de ligne maximale
  # et en utilisant un caractere determine a l'appel comme lieu de coupure potentielle
  #
  # si wider_forbidden = 1, alors il est permis que la largeur d'une ligne depasse la largeur prescrite
  #                         sinon, la ligne sera coupee a la largeur prescrite, meme si on ne trouve
  #                         plus de split_char pour faire de coupure (la partie tronquee sera reportee
  #                         sur la ligne suivante)


  while (length($txt) > $width)
  {
    $where = rindex($txt, $split_char, $width-1);
    if ($where >= 0)
    {
      push(@res, substr($txt, 0, $where+1));
      $txt = substr($txt, $where+1);
      next;
    }
    
    unless ($wider_forbidden)
    {
      $where = index($txt, $split_char);
      if ($where >= 0)
      {
        push(@res, substr($txt, 0, $where+1));
        $txt = substr($txt, $where+1);
        next;
      }
    }

    $where = rindex($txt, " ", $width-1);
    if ($where >= 0)
    {
      push(@res, substr($txt, 0, $where+1));
      $txt = substr($txt, $where+1);
      next;
    }
    
    unless ($wider_forbidden)
    {
      $where = index($txt, " ");
      if ($where >= 0)
      {
        push(@res, substr($txt, 0, $where+1));
        $txt = substr($txt, $where+1);
        next;
      }
    }
     
    last unless $wider_forbidden;
    
    push(@res, substr($txt, 0, $width));
    $txt = substr($txt, $width);
  }

  push(@res, $txt) if $txt =~ /\S/;

  join("\n", @res);
}


sub RAligned()
{
  my ($txt, $width) = @_;

  " " x ($width-length($txt)) . $txt;
}


sub ReadDataAndBuildClusters()
{
  my ($infile, $method, $ndiff) = @_;
  my (@dat, @tmp, @clusters0, @ids, @idsthiscluster0, @idsthiscluster,
      @tmpids, @idsb4, @res, @clusters, @out,
      $dat, $id, $cluster, $cluster0, $id0, $id1, $id2, $index, $diff, $alphanum_ids,
      $nclusters, $in, $kluster0, $kluster, $tmp, $n, $c, $nnewclusters, $strainlength, $k,
      %exists, %newclusterno, %newclusterrank,
      %definedminid4cluster, %cluster0, %clustersize, %minid4cluster, %cluster, %tmpindex);


  # global hashes: %Cluster %diff %Cluster0Ids %ClusterSize @ClustersList %NewClusterIds %NumericIdno

  open(DAT, $infile) || &FatalError($FATAL_FILENOTREADABLE, $infile);

  while ($dat = <DAT>)
  {
    chomp $dat;
    $dat =~ s/\t/ /g;
    $dat =~ s/\r/ /g;
    $dat =~ s/ +/ /g;
    $dat =~ s/\s+$//;      # rtrim
    $dat =~ s/\s*(\S)/\1/; # ltrim
    push(@dat, $dat) if $dat;
  }
  close DAT;
  
  @tmp = split(" ", $dat[0]);
  shift @dat if $tmp[0] =~ /[a-z]/i;
  
  foreach $dat (@dat)
  {
    @tmp = split(" ", $dat);
    if (@tmp)
    {
      $cluster0 = $IS6110CLUSTER ? shift @tmp : -1;
      $id = shift @tmp;
      $alphanum_ids = 1 if $id =~ /\D/;
      &FatalError($FATAL_REPEATEDPATIENT, $id) if $exists{$id};
      $exists{$id} = 1;
      $cluster0{$id} = $cluster0;
      $strain{$id} = join(" ", @tmp);
  
      if ($SPOLIGO)
      {
        &FatalError($FATAL_NONSPOLIGO, $id) if $tmp[0] =~ /[^gi]/;
        $tmp[0] =~ tr/gi/10/;
        @tmp = split("", $tmp[0]);
        $strain{$id} = join(" ", @tmp);
      }
  
      if ($strainlength)
      {
        &FatalError($FATAL_UNEVENSTRAINS, $id) if $strainlength != ($#tmp + 1);
      }
      else
      {
        $strainlength = $#tmp + 1;
      }
    }
  }
  
  @clusters0 = &Unique(values %cluster0);
  $NIS6110CLUSTERS = $IS6110CLUSTER ? ($#clusters0 + 1) : 0;

  # Give a unique numeric id to each patient, wheter or not their idnos (in data set) is numeric

  if ($alphanum_ids)
  {
    @ids = sort {$a cmp $b} keys %cluster0;
  }
  else
  {
    @ids = sort {$a <=> $b} keys %cluster0;
  }    
  $tmp = 0;
  foreach (@ids)
  {
    $tmp++;
    $NumericIdno{$_} = $tmp;
  }

  
  @ids = sort {$cluster0{$a} <=> $cluster0{$b} || $NumericIdno{$a} <=> $NumericIdno{$b}} keys %cluster0;
  $n = $#ids + 1;
  $Samplesize = $n; # global variable
  
  while (@ids)
  {
    $id = shift @ids;
    $cluster0 = $cluster0{$id};
    $kluster0 = "$cluster0-" if $IS6110CLUSTER;
    @idsthiscluster0 = ($id);
    push(@idsthiscluster0, shift @ids) while @ids && $cluster0{$ids[0]} == $cluster0;
  
    $Cluster0Ids{$cluster0} = join(" ", @idsthiscluster0);
  
    # --- Compute distances between each pair of subjects
  
    @tmpids = @idsthiscluster0;
    while (@tmpids)
    {
      $id1 = shift @tmpids;
      foreach $id2 (@tmpids)
      {
        $diff = &Diff($method, $strain{$id1}, $strain{$id2});
        $index = "$id1:$id2";
        $diff{$index} = $diff;
      }
    }
  
    # --- Build clusters
  
    @tmpids = @idsthiscluster0;
    $id = shift @tmpids;
    $nclusters = 1;
    $cluster{$id} = 1;
    $clustersize{$cluster{$id}} = 1;
  
    while (@tmpids)
    {
      $id1 = shift @tmpids;
      @idsb4 = sort {$cluster{$a} <=> $cluster{$b} || $NumericIdno{$a} <=> $NumericIdno{$b}} keys %cluster;
  
      while (@idsb4)
      {
        $id0 = shift @idsb4;
        @idsthiscluster = ($id0);
        $index = "$id0:$id1";
        $in = ($diff{$index} <= $ndiff) ? 1 : 0;
        while (!$in && @idsb4 && $cluster{$idsb4[0]} == $cluster{$id0})
        {
          $id = shift @idsb4;
          push(@idsthiscluster, $id);
          $index = "$id:$id1";
          $in = 1 if $diff{$index} <= $ndiff;
        }
        if ($in)
        {
          push(@idsthiscluster, shift @idsb4) while @idsb4 && $cluster{$idsb4[0]} == $cluster{$id0};
  
          if ($cluster{$id1})
          {
            $clustersize{$cluster{$id1}} += $clustersize{$cluster{$id0}};
            delete $clustersize{$cluster{$id0}};
            foreach $id (@idsthiscluster)
            {
              $cluster{$id} = $cluster{$id1};
            }
          }
          else
          {
            $cluster{$id1} = $cluster{$id0};
            $clustersize{$cluster{$id0}}++;
          }
        }
      }
  
      unless ($cluster{$id1})
      {
        $nclusters++;
        $cluster{$id1} = $nclusters;
        $clustersize{$nclusters} = 1;
      }
    }
  
    # --- Redefine cluster labels (numbers)
  
    # (but first, find the smallest id# within each cluster [that will be used to sort clusters of same size])
  
    foreach $id (keys %cluster)
    {
      $c = $cluster{$id};
      $minid4cluster{$c} = $NumericIdno{$id} if !$definedminid4cluster{$c} || $NumericIdno{$id} < $minid4cluster{$c};
      $definedminid4cluster{$c} = 1;
    }
  
    # --
  
    @clusters = sort {$clustersize{$b} <=> $clustersize{$a} || $minid4cluster{$a} <=> $minid4cluster{$b}} keys %clustersize;
  
    $c = 1;
    foreach $cluster (@clusters)
    {
      $k = "$kluster0$c";
      $newclusterno{$cluster} = $k;
  
      if ($clustersize{$cluster} == 1)
      {
        $NSize1++;
      }
      else
      {
        push(@ClustersList, $k);
        $ClusterSize{$k} = $clustersize{$cluster};
  
        $tmp = length($k);
        $LMAXKLUSTER = $tmp if $tmp > $LMAXKLUSTER;
      }
  
      delete $clustersize{$cluster}; # do some spring cleaning
  
      $nnewclusters++;
      $newclusterrank{$k} = $nnewclusters;
  
      $c++;
    }
  
    # --- do some cleaning
  
    foreach $id (keys %cluster)
    {
      $k = $newclusterno{$cluster{$id}};
      $Cluster{$id} = $k;
  
      $NewClusterIds{$k} .= " " if length($NewClusterIds{$k});
      $NewClusterIds{$k} .= $id;
  
      delete $cluster{$id};
    }
  
  
    # --- Append infos to @out
  
  
    @idsthiscluster0 = sort {$newclusterrank{$Cluster{$a}} <=> $newclusterrank{$Cluster{$b}} || $NumericIdno{$a} <=> $NumericIdno{$b}} @idsthiscluster0;
  
    foreach $id1 (@idsthiscluster0)
    {
      foreach $id2 (@idsthiscluster0)
      {
        next if $NumericIdno{$id1} == $NumericIdno{$id2};
        $index = join(":", sort {$NumericIdno{$a} <=> $NumericIdno{$b}} ($id1, $id2));
        $tmpindex{$id2} = $index;
      }
  
      @tmpids = sort {$diff{$tmpindex{$a}} <=> $diff{$tmpindex{$b}} || $NumericIdno{$a} <=> $NumericIdno{$b}} keys %tmpindex;
  
      foreach $id2 (@tmpids)
      {
        $tmp = "$Cluster{$id1} $id1 $id2 $diff{$tmpindex{$id2}}";
  
        if ($DistancesListVersion == $LONG)
        {
          push(@out, $tmp);
        }
        elsif ($DistancesListVersion == $SHORT && $Cluster{$id1} eq $Cluster{$id2})
        {
          push(@out, $tmp);
        }
      }
  
      # clean
  
      foreach $id2 (keys %tmpindex)
      {
        delete $tmpindex{$id2};
      }
    }
  }
  
  # and finally, sort ids within each new cluster
  
  foreach $k (keys %NewClusterIds)
  {
    @ids = sort {$NumericIdno{$a} <=> $NumericIdno{$b}} split(" ", $NewClusterIds{$k});
    $NewClusterIds{$k} = join(" ", @ids);
  }
  
  @out;
}


sub TrueDate()
{
  my ($sec, $min, $hour, $mday, $mon, $year, @months);

  ($sec, $min, $hour, $mday, $mon, $year) = localtime (time);
  $year += 1900;
  @months = ("Jan", "Feb", "Mar", "Apr", "May", "Jun",  "Jul", "Aug", "Sep", "Oct", "Nov", "Dec");

  $min = "0" . $min if $min < 10;

  ("$hour:$min", "$mday $months[$mon] $year");
}


sub Unique()
{
  my (@x) = @_;
  my (@unique, $x);

  # This fct was designed to return unique values of a list of NUMERIC values only

  if (@x)
  {
    @x = sort {$a <=> $b} @x;
    push(@unique, shift @x);

    while (@x)
    {
      $x = shift @x;
      push(@unique, $x) if $x != $unique[$#unique];
    }
  }

  @unique;
}
