#! /usr/bin/perl
# $Id: gmap_build.pl.in 65046 2012-05-25 17:11:19Z twu $

use warnings;	

$gmapdb = "/var/cache/gmap";
$package_version = "2012-06-12";

use Getopt::Std;
use File::Copy;	

undef($opt_B);	  # binary directory
undef($opt_T);	  # temporary build directory
undef($opt_D);	  # destination directory
undef($opt_b);	  # offsetscomp basesize
undef($opt_d);	  # genome name
undef($opt_k);	  # k-mer size for genomic index (allowed: 16 or less)
undef($opt_q);	  # sampling interval for genome (default: 3)
undef($opt_S);	  # Old flag for turning off sorting
undef($opt_s);	  # Sorting
undef($opt_g);	  # gunzip files
$opt_w = 2; # waits (sleeps) this many seconds between steps.  Useful if there is a delay in the filesystem.
getopts("B:T:D:b:d:k:q:s:gw:");

if (!defined($bindir = $opt_B)) {
    $bindir = "/usr/bin";
}

if (!defined($builddir = $opt_T)) {
    $builddir = ".";
}


if (!defined($kmersize = $opt_k)) {
    print STDERR "-k flag not specified, so building with default 15-mers and base size 12-mers\n";
    $kmersize = 15;
    $basesize = 12;
} elsif (!defined($basesize = $opt_b)) {
    if ($kmersize == 15) {
	print STDERR "-k flag specified as 15, but not -b, so building with basesize == 12\n";
	$basesize = 12;
    } else {
	print STDERR "-k flag specified (not as 15), but not -b, so building with basesize == kmer size\n";
	$basesize = $kmersize;
    }
}

if (!defined($sampling = $opt_q)) {
    $sampling = 3;
}

if (!defined($dbname = $opt_d)) {
  print_usage();
  die "Must specify genome database name with -d flag.";
} elsif ($opt_d =~ /(\S+)\/(\S+)/) {
  $dbname = $2;
  if (defined($opt_D)) {
    $opt_D = $opt_D . "/" . $1;
  } else {
    $opt_D = $1;
  }
}

$dbname =~ s/\/$//;	# In case user gives -d argument with trailing slash

if (!defined($destdir = $opt_D)) {
    $destdir = $gmapdb;
}

if (defined($opt_S)) {
    print STDERR "The flag -S is now deprecated.  Using '-s none' instead.\n";
    $chr_order_flag = "-s none";
} elsif (defined($opt_s)) {
    $chr_order_flag = "-s $opt_s";
} else {
    # Default is to order genomes
    print STDERR "Sorting chromosomes in chrom order.  To turn off or sort other ways, use the -s flag.\n";
    $chr_order_flag = "";
}

if (defined($opt_g)) {
    $gunzip_flag = "-g";
} else {
    $gunzip_flag = "";
}


$genome_fasta = join(" ",@ARGV);



#####################################################################################

create_coords();
create_genome_version();
make_contig();
compress_genome();
create_index_offsets();
create_index_positions();
install_db();

exit;


#####################################################################################


sub create_coords {
    $cmd = "$bindir/fa_coords $gunzip_flag -o $builddir/$dbname.coords $genome_fasta";
    print STDERR "Running $cmd\n";
    if (($rc = system($cmd)) != 0) {
	die "$cmd failed with return code $rc";
    }
    sleep($opt_w);
    return;
}

sub check_coords_exists {
# file exists, and is not empty
    unless (-s "$builddir/$dbname.coords") {
	die "ERROR: $builddir/$dbname.coords not found.\n";
    }
    return;
}

sub create_genome_version {
    open GENOMEVERSIONFILE, ">$builddir/$dbname.version" or die $!;
    print GENOMEVERSIONFILE "$dbname\n";
    close GENOMEVERSIONFILE or die $!;
    sleep($opt_w);
    return;
}

sub make_contig {
    check_coords_exists();
    $cmd = "$bindir/gmap_process $gunzip_flag -c $builddir/$dbname.coords $genome_fasta | $bindir/gmapindex -d $dbname -D $builddir -A $chr_order_flag";
    print STDERR "Running $cmd\n";
    if (($rc = system($cmd)) != 0) {
	die "$cmd failed with return code $rc";
    }
    sleep($opt_w);
    return;
}

sub compress_genome {
    $cmd = "$bindir/gmap_process $gunzip_flag -c $builddir/$dbname.coords $genome_fasta | $bindir/gmapindex -d $dbname -F $builddir -D $builddir -G";
    print STDERR "Running $cmd\n";
    if (($rc = system($cmd)) != 0) {
	die "$cmd failed with return code $rc";
    }
    sleep($opt_w);
    return;
}

sub full_ASCII_genome {
    check_coords_exists();
    make_contig();
	
    $cmd = "$bindir/gmap_process $gunzip_flag -c $builddir/$dbname.coords $genome_fasta | $bindir/gmapindex -d $dbname -F $builddir -D $builddir -l -G";
    print STDERR "Running $cmd\n";
    if (($rc = system($cmd)) != 0) {
	die "$cmd failed with return code $rc";
    }
    sleep($opt_w);
    return;
}

sub create_index_offsets {
    $cmd = "cat $builddir/$dbname.genomecomp | $bindir/gmapindex -b $basesize -k $kmersize -q $sampling -d $dbname -F $builddir -D $builddir -O";
    print STDERR "Running $cmd\n";
    if (($rc = system($cmd)) != 0) {
	die "$cmd failed with return code $rc";
    }
    sleep($opt_w);
    return;
}

sub create_index_positions {
    $cmd = "cat $builddir/$dbname.genomecomp | $bindir/gmapindex -b $basesize -k $kmersize -q $sampling -d $dbname -F $builddir -D $builddir -P";
    print STDERR "Running $cmd\n";
    if (($rc = system($cmd)) != 0) {
	die "$cmd failed with return code $rc";
    }
    sleep($opt_w);
    return;
}

sub install_db {
    my @suffixes = (
	"chromosome", 
	"chromosome.iit", 
	"chrsubset", 
	"contig", 
	"contig.iit", 
	"genomecomp", 
	"version");
	
    if ($kmersize > $basesize) {
	push @suffixes,sprintf "ref%02d%02d%dgammaptrs",$basesize,$kmersize,$sampling;
    }
    push @suffixes,sprintf "ref%02d%02d%doffsetscomp",$basesize,$kmersize,$sampling;
    push @suffixes,sprintf "ref%02d%dpositions",$kmersize,$sampling;

    print STDERR "Copying files to directory $destdir/$dbname\n";
    system("mkdir -p $destdir/$dbname");
    system("mkdir -p $destdir/$dbname/$dbname.maps");
    system("chmod 755 $destdir/$dbname/$dbname.maps");
    foreach $suffix (@suffixes) {
	system("mv $builddir/$dbname.$suffix $destdir/$dbname/$dbname.$suffix");
	system("chmod 644 $destdir/$dbname/$dbname.$suffix");
    }

    system("rm -f $builddir/$dbname.coords");
    return;
}



sub print_usage {
  print <<TEXT1;

gmap_build: Builds a gmap database for a genome to be used by GMAP or GSNAP.
Part of GMAP package, version $package_version.

A simplified alternative to using the program gmap_setup, which creates a Makefile.

Usage: gmap_build [options...] -d <genomename> <fasta_files>

Options:
    -d STRING   genome name
    -T STRING   temporary build directory
    -D STRING   destination directory for installation (defaults to gmapdb directory specified at configure time)
    -b INT      basesize for offsetscomp (default 12)
    -k INT      k-mer value for genomic index (allowed: 16 or less, default 15)
    -q INT      sampling interval for genomoe (allowed: 1-3, default 3)
    -s STRING   Sort chromosomes using given method:
                  none - use chromosomes as found in FASTA file(s)
                  alpha - sort chromosomes alphabetically (chr10 before chr 1)
                  numeric-alpha - chr1, chr1U, chr2, chrM, chrU, chrX, chrY
                  chrom - chr1, chr2, chrM, chrX, chrY, chr1U, chrU
    -g          files are gzipped, so need to gunzip each file first
    -w INT      wait (sleep) this many seconds after each step

TEXT1
  return;
}

