#!/usr/bin/env python
# Time-stamp: <2011-11-02 14:01:10 Tao Liu>

"""Description: Random sample certain number/percentage of tags.

Copyright (c) 2011 Tao Liu <taoliu@jimmy.harvard.edu>

This code is free software; you can redistribute it and/or modify it
under the terms of the Artistic License (see the file COPYING included
with the distribution).

@status: release candidate
@version: $Id$
@author:  Yong Zhang, Tao Liu
@contact: taoliu@jimmy.harvard.edu
"""

# ------------------------------------
# python modules
# ------------------------------------

import os
import sys
import logging
from subprocess import Popen,PIPE
from optparse import OptionParser
import gzip

# ------------------------------------
# own python modules
# ------------------------------------
from MACS2.OptValidator import opt_validate_randsample as opt_validate
from MACS2.cProb import binomial_cdf_inv
from MACS2.Constants import *
# ------------------------------------
# Main function
# ------------------------------------
def main():
    """The Main function/pipeline for duplication filter.
    
    """
    # Parse options...
    options = opt_validate(prepare_optparser())
    # end of parsing commandline options
    info = options.info
    warn = options.warn
    debug = options.debug
    error = options.error
    #0 check output file
    if options.outputfile:
        assert not os.path.exists(options.outputfile), "%s already exists, please check!" % options.outputfile
        outfhd = open(options.outputfile,"w")
    else:
        outfhd = sys.stdout
    
    #1 Read tag files
    info("read tag files...")
    fwtrack = load_tag_files_options (options)
    
    info("tag size = %d" % options.tsize)
    fwtrack.fw = options.tsize

    t0 = fwtrack.total
    info(" total tags in alignment file: %d" % (t0))
    if options.number:
        if options.number > t0:
            error(" Number you want is bigger than total number of tags in alignment file! Please specify a smaller number and try again!")
            error(" %.2e > %.2e" % (options.number, t0))
            sys.exit(1)
        info(" Number of tags you want to keep: %.2e" % (options.number))
        options.percentage = float(options.number)/t0*100
    info(" Percentage of tags you want to keep: %.2f%%" % (options.percentage))
    
    fwtrack.sample_percent(options.percentage/100.0)

    info(" tags after random sampling in alignment file: %d" % (fwtrack.total))

    info("Write to BED file")
    fwtrack.print_to_bed(fhd=outfhd)
    info("finished! Check %s." % options.outputfile)

def prepare_optparser ():
    """Prepare optparser object. New options will be added in this
    function first.
    
    """
    usage = "usage: %prog <-t file> [-p percent|-n number] [-o outputfile] [options]"
    description = "%prog -- Random sample certain number/percentage of tags. Method: 1. Calculate the percentage of tags needed to be kept; 2. For each chromosome, random sample certain percentage of tags."

    optparser = OptionParser(version="%prog "+RANDSAMPLE_VERSION,description=description,usage=usage,add_help_option=False)
    optparser.add_option("-h","--help",action="help",help="show this help message and exit.")
    optparser.add_option("-t",dest="tfile",type="string",
                         help="Sequencing alignment file. REQUIRED.")
    optparser.add_option("-p","--percentage",dest="percentage",type="float",
                         help="Percentage of tags you want to keep. Input 80.0 for 80%. This option can't be used at the same time with -n/--num. REQUIRED")
    optparser.add_option("-n","--number",dest="number",type="float",
                         help="Number of tags you want to keep. Input 8000000 or 8e+6 for 8 million. This option can't be used at the same time with -p/--percent. Note that the number of tags in output is approximate as the number specified here. REQUIRED")
    optparser.add_option("-o",dest="outputfile",type="string",
                         help="Output BED file name. If not specified, will write to standard output. DEFAULT: stdout",
                         default=None)
    optparser.add_option("-s","--tsize",dest="tsize",type="int",default=None,
                         help="Tag size. This will overide the auto detected tag size. DEFAULT: Not set")
    optparser.add_option("-f","--format",dest="format",type="string",
                         help="Format of tag file, \"AUTO\", \"BED\" or \"ELAND\" or \"ELANDMULTI\" or \"ELANDEXPORT\" or \"SAM\" or \"BAM\" or \"BOWTIE\". The default AUTO option will let %prog decide which format the file is. Please check the definition in 00README file if you choose ELAND/ELANDMULTI/ELANDEXPORT/SAM/BAM/BOWTIE. DEFAULT: \"AUTO\"",
                         default="AUTO")
    optparser.add_option("--verbose",dest="verbose",type="int",default=2,
                         help="Set verbose level. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. If you want to know where are the duplicate reads, use 3. DEFAULT:2")
    return optparser
   
def load_tag_files_options ( options ):
    """From the options, load alignment tags.

    """
    options.info("read alignment tags...")
    tp = options.parser(open2(options.tfile))

    if not options.tsize:           # override tsize if user specified --tsize
        ttsize = tp.tsize()
        options.tsize = ttsize

    treat = tp.build_fwtrack()
    treat.sort()

    options.info("tag size is determined as %d bps" % options.tsize)
    return treat

def open2(path, mode='r', bufsize=-1):
    # try gzip first
    f = gzip.open(path, mode)
    try:
        f.read(10)
    except IOError:
        # not a gzipped file
        f.close()
        f = open(path, mode, bufsize)
    else:
        f.seek(0)
    return f

if __name__ == '__main__':
    try:
        main()
    except KeyboardInterrupt:
        sys.stderr.write("User interrupt me! ;-) Bye!\n")
        sys.exit(0)
    except AssertionError as e:
        sys.stderr.write(e.message+"\n")
        sys.exit(0)
