#!/usr/bin/env python
# See the LICENSE file included with this software for license information.

'''
Parsnp command-line driver: argument parsing, environment setup, and helper
routines for input validation, external-tool execution, and output handling.
'''

import os, sys, string, random, subprocess, time, operator, math, datetime, numpy #pysam
from collections import defaultdict, namedtuple
import shutil
import multiprocessing
import shlex
from tempfile import TemporaryDirectory
import re
import logging
from logger import logger, TqdmToLogger, MIN_TQDM_INTERVAL
import argparse
import signal
from multiprocessing import Pool
from Bio import SeqIO, AlignIO
from glob import glob
from pathlib import Path


from tqdm import tqdm

__version__ = "2.0.6"
# Reroot the output tree at its midpoint; disabled below if dendropy is absent.
reroot_tree = True #use --midpoint-reroot
# Module-wide RNG with a fixed seed so runs are reproducible.
random_seeded = random.Random(42)

try:
    import dendropy
except ImportError:
    reroot_tree = False

# Aligner name -> numeric code (values are strings; presumably written into
# the parsnp config consumed by the core binary — confirm downstream use).
ALIGNER_TO_IDX = {
        "mafft": "1",
        "muscle": "2",
        "fsa": "3",
        "prank": "4"
}

VERBOSE = 0
VERSION = __version__
# Window size used for PhiPack runs (presumably bp — confirm against usage).
PHI_WINDOWSIZE = 1000
TOTSEQS=0
# Default install dir: directory of this script (may be overridden below).
PARSNP_DIR = sys.path[0]



########################################### Environment ############################################
# Allow the PARSNPDIR env var to override the install dir used to locate tools.
try:
    os.environ["PARSNPDIR"]
    PARSNP_DIR = os.environ["PARSNPDIR"]
except KeyError:
    PARSNP_DIR = sys.path[0]
# Set to True by the SIGINT handler installed below.
SIGINT = False

# Prepend the install dir to PYTHONPATH so child processes can import helpers.
try:
    os.environ["PYTHONPATH"] = PARSNP_DIR + os.pathsep + os.environ["PYTHONPATH"]
except KeyError:
    os.environ["PYTHONPATH"] = PARSNP_DIR + os.pathsep


# Detect whether we are running from a frozen (e.g. PyInstaller-style) binary.
frozenbinary = True
application_path = ""
if getattr(sys, 'frozen', False):
    application_path = os.path.dirname(sys.executable)
elif __file__:
    application_path = os.path.dirname(__file__)
    frozenbinary = False


# For frozen binaries, make the bundled ../lib visible to the dynamic linker.
if frozenbinary:
   utilPath = PARSNP_DIR
   libPath = os.path.abspath(os.path.join(utilPath, "..", "lib"))
   if os.path.exists(libPath):
      oldLDPath = ""
      needToAdd = True
      # Respect an existing LD_LIBRARY_PATH (Linux) or
      # DYLD_FALLBACK_LIBRARY_PATH (macOS); only prepend once.
      if "LD_LIBRARY_PATH" in os.environ:
          oldLDPath = os.environ["LD_LIBRARY_PATH"]
          if libPath in oldLDPath:
              needToAdd = False
      elif "DYLD_FALLBACK_LIBRARY_PATH" in os.environ:
         oldLDPath = os.environ["DYLD_FALLBACK_LIBRARY_PATH"]
         if libPath in oldLDPath:
            needToAdd = False
      if needToAdd:
         os.environ["DYLD_FALLBACK_LIBRARY_PATH"] = libPath + os.pathsep + oldLDPath
         os.environ["LD_LIBRARY_PATH"] = libPath + os.pathsep + oldLDPath

# Add binaries to path
# os.environ["PATH"] += os.pathsep + os.path.join(PARSNP_DIR, "bin")

# OS detection at import time via `uname` (same logic as get_os() below).
OSTYPE="linux"
p = subprocess.Popen("echo `uname`", shell=True, stdin=None, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
(checkStdout, checkStderr) = p.communicate()
if checkStderr != b"":
    sys.stderr.write("Warning: Cannot determine OS, defaulting to %s\n"%(OSTYPE))
else:
    OSTYPE = checkStdout.decode('utf-8').strip()

# Suffix used to select prebuilt binaries: "osx" on Darwin, "linux" otherwise.
binary_type = "linux"
if OSTYPE == "Darwin":
    binary_type = "osx"
else:
    binary_type = "linux"

####################################################################################################


######################################## Utility Functions #########################################
def get_os():
    """Determine the host OS and the matching prebuilt-binary flavor.

    Returns:
        tuple (OSTYPE, binary_type): OSTYPE is the output of ``uname``
        (e.g. "Linux", "Darwin"), defaulting to "Linux" with a warning when
        ``uname`` cannot be run; binary_type is "osx" for Darwin and
        "linux" otherwise.
    """
    # Run uname directly instead of the original "echo `uname`" through a
    # shell: one fewer process, no shell injection surface, same output.
    try:
        proc = subprocess.run(["uname"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except OSError:
        proc = None
    if proc is None or proc.returncode != 0 or proc.stderr != b'':
        OSTYPE = "Linux"
        logger.warning("Cannot determine OS, defaulting to %s"%(OSTYPE))
    else:
        OSTYPE = proc.stdout.decode('utf-8').strip()
    binary_type = "osx" if OSTYPE == "Darwin" else "linux"
    return OSTYPE, binary_type


def handler(signum, frame):
    """SIGINT handler: record the interrupt and terminate the process.

    Sets the module-level SIGINT flag (checked by worker wrappers) and exits
    with status 128, the conventional "killed by signal" exit code.
    """
    global SIGINT
    SIGINT = True
    logger.critical('Caught request to terminate by user (CTRL+C), exiting now, bye')
    sys.exit(128)

# Install the handler so CTRL+C terminates cleanly with a log message.
signal.signal(signal.SIGINT, handler)

def xmfa_to_maf(xmfa_path, maf_path, all_input_paths):
    """Convert a parsnp XMFA alignment file into MAF format.

    The XMFA header is parsed for the sequence-index -> input-filename
    mapping; the original input FASTA files are re-read to recover each
    contig's id and length (needed for MAF "s" lines).

    Args:
        xmfa_path: path to the parsnp .xmfa output file.
        maf_path: destination .maf file (overwritten).
        all_input_paths: paths of every input FASTA (including the ".ref"
            copy of the reference, whose suffix is stripped here).
    """
    sample_delim = '#'
    SeqInfo = namedtuple("SeqInfo", "cid seq_length")
    # Matches one 4-line "##Sequence..." header block in the XMFA preamble.
    hdr_block_pattern = re.compile(r"##SequenceIndex (\d+)\n##SequenceFile (.+)\n##SequenceHeader >\s*(\S+).*\n##SequenceLength (\d+)bp")
    idx_to_fname = {}
    ref_fname = ""
    with open(xmfa_path) as xmfa_in:
        next(xmfa_in) # skip version
        line = next(xmfa_in)
        seq_count = int(re.match(r"#SequenceCount (\d+)\n", line).groups()[0])
        # Each of the seq_count sequences has a fixed 4-line header block.
        for i in range(seq_count):
            info_block = ""
            for _ in range(4):
                info_block += next(xmfa_in)
            parsed_info = hdr_block_pattern.match(info_block).groups()
            fname = parsed_info[1]
            # The first ".ref" file is the reference; strip the suffix so it
            # matches the original input filename.
            if fname.endswith(".ref") and ref_fname == "":
                fname = fname[:-4]
                ref_fname = fname
            idx_to_fname[parsed_info[0]] = fname

    # Per input file: ordered (contig id, contig length) for every record.
    fname_to_info = defaultdict(list)
    for f in all_input_paths:
        fname = Path(f).name
        if fname.endswith(".ref"):
            fname = fname[:-4]
        fname_to_info[fname] = [SeqInfo(rec.id.split(" ")[0], len(rec.seq)) for rec in SeqIO.parse(f, "fasta")]

    # parsnp record ids look like "cluster<N> s<contig>:p<pos>".
    seqid_pattern = re.compile(r'^cluster(\d+) s(\d+):p(\d+)')
    with open(maf_path, 'w') as maf_out:
        for aln in AlignIO.parse(xmfa_path, "mauve"):
            for rec_idx, rec in enumerate(aln):
                aln_len = rec.annotations["end"] - rec.annotations["start"]
                cluster_idx, contig_idx, startpos = [int(x) for x in seqid_pattern.match(rec.id).groups()]
                fname = idx_to_fname[rec.name]
                if rec_idx == 0:
                    maf_out.write(f"a cluster={cluster_idx}\n")
                elif fname == ref_fname:
                    # NOTE(review): the reference is only emitted when it is
                    # the first record of a block — confirm this is intended.
                    continue
                # contig_idx is 1-based in the XMFA ids.
                rec_info = fname_to_info[fname][contig_idx-1]
                maf_out.write(f"s\t{'.'.join(fname.split('.')[:-1])}{sample_delim}{rec_info.cid}")
                maf_out.write(f"\t{startpos}")
                maf_out.write(f"\t{aln_len}")
                maf_out.write(f"\t{'+' if rec.annotations['strand'] == 1 else '-'}")
                maf_out.write(f"\t{rec_info.seq_length}")
                maf_out.write(f"\t{rec.seq}\n")
            maf_out.write("\n")
    return

#TODO Merge run fns
def run_phipack(query, seqlen, workingdir):
    """Run PhiPack's Profile on `query` from inside `workingdir`.

    Args:
        query: input alignment filename (relative to workingdir).
        seqlen: alignment length, passed to Profile's -n option.
        workingdir: directory to chdir into for the run.

    NOTE(review): run_command tokenizes with shlex and does not use a shell,
    so the trailing "> <query>.out" is passed to Profile as literal
    arguments rather than acting as a redirection — confirm intended.
    """
    currdir = os.getcwd()
    os.chdir(workingdir)
    try:
        command = "Profile -o -v -n %d -w 100 -m 100 -f %s > %s.out"%(seqlen,query,query)
        run_command(command, 1)
    finally:
        # Always restore the caller's cwd, even if run_command raises/exits.
        os.chdir(currdir)

def run_fasttree(query, workingdir, recombination_sites):
    """Run FastTree on seq.fna inside `workingdir`, writing out.tree.

    Args:
        query: unused by the command line (kept for wrapper compatibility).
        workingdir: directory to chdir into for the run.
        recombination_sites: currently unused — kept for interface stability.
    """
    currdir = os.getcwd()
    os.chdir(workingdir)
    try:
        command = "fasttree -nt -quote -gamma -slow -boot 100 seq.fna > out.tree"
        run_command(command,1)
    finally:
        # Always restore the caller's cwd, even if run_command raises/exits.
        os.chdir(currdir)


#TODO Merge wrappers
def parallelFtWrapper(params):
    """Pool worker: run FastTree for one job described by `params`.

    Args:
        params: dict with keys "jobID", "query", "dir", "recombination".

    Returns:
        dict {"jobID": ..., "status": 1 on success, 0 on error/interrupt}.
    """
    # Build the result up-front so the except blocks can never hit an
    # unbound local (previously `result`/`jobID` were referenced before
    # assignment if the params lookup itself raised).
    result = {"jobID": params.get("jobID", -1), "status": 0}
    try:
        run_fasttree(params["query"], params["dir"], params["recombination"])
        result["status"] = 1
        return result
    except KeyboardInterrupt:
        result["status"] = 0
        logger.info("Keyboard interrupt in thread %d, quitting\n"%(result["jobID"]))
        return result
    except Exception:
        result["status"] = 0
        logger.info("Other error in thread %d, quitting\n"%(result["jobID"]))
        return result


def parallelPhiWrapper(params):
    """Pool worker: run PhiPack for one job described by `params`.

    Args:
        params: dict with keys "jobID", "query", "seqlen", "dir".

    Returns:
        dict {"jobID": ..., "status": 1 on success, 2 if the sequence is
        too short (< 1000 bp) for PhiPack, 0 on error/interrupt}.
    """
    # Build the result up-front so the except blocks can never hit an
    # unbound local (previously `result`/`jobID` were referenced before
    # assignment if the params lookup itself raised).
    result = {"jobID": params.get("jobID", -1), "status": 0}
    try:
        if params["seqlen"] >= 1000:
            run_phipack(params["query"], params["seqlen"], params["dir"])
            result["status"] = 1
        else:
            # Sequence too short for a meaningful PhiPack run; skip it.
            result["status"] = 2
        return result
    except KeyboardInterrupt:
        result["status"] = 0
        logger.info("Keyboard interrupt in thread %d, quitting\n"%(result["jobID"]))
        return result
    except Exception:
        result["status"] = 0
        logger.info("Other error in thread %d, quitting\n"%(result["jobID"]))
        return result

def run_logged_command(command, outputDir, label, **kwargs):
    """Run `command`, capturing stdout/stderr into <outputDir>/log/<label>.{out,err}.

    When a `time` executable is available on PATH, the command is prefixed
    with it so resource usage lands in the logs. Extra kwargs are forwarded
    to run_command; the command's return code is returned.
    """
    out_path = f"{outputDir}/log/{label}.out"
    err_path = f"{outputDir}/log/{label}.err"
    timed_command = command if shutil.which("time") is None else "time " + command
    with open(out_path, 'w') as stdout_f, open(err_path, 'w') as stderr_f:
        return run_command(command=timed_command, stdout=stdout_f, stderr=stderr_f, **kwargs)

def run_command(command, ignorerc=0, stdout=subprocess.PIPE, stderr=subprocess.PIPE):
    """Run an external command and return its exit code.

    The command string is tokenized with shlex and executed WITHOUT a shell,
    so shell metacharacters (e.g. '>') are passed through literally.

    Args:
        command: command line to execute.
        ignorerc: when truthy, a nonzero exit code is logged but tolerated;
            otherwise the process output is logged and the program exits.
        stdout/stderr: subprocess.PIPE (default) or open file objects.

    Returns:
        int: the command's return code.
    """
    logger.debug(command)
    result = subprocess.run(shlex.split(command), stderr=stderr, stdout=stdout)
    rc = result.returncode

    if stdout == subprocess.PIPE:
        fstdout = result.stdout.decode()
    else:
        fstdout = f"STDOUT was piped... please see output file:\t {stdout.name}"
    if stderr == subprocess.PIPE:
        fstderr = result.stderr.decode()
    else:
        fstderr = f"STDERR was piped... please see output file:\t {stderr.name}"
    if rc != 0 and not ignorerc:
      # Typo fix: message previously read "Please veryify".
      logger.critical("""The following command failed:
      >>$ {}
      Please verify input data and restart Parsnp.
      If the problem persists please contact the Parsnp development team.

      STDOUT:
      {}

      STDERR:
      {}""".format(command, fstdout, fstderr))

      sys.exit(rc)
    else:
      logger.debug(fstdout)
      logger.debug(fstderr)
      logger.debug("")
    return rc


def is_valid_file_path(parser, arg):
    """argparse-style validator for an optional file path.

    Returns `arg` unchanged when it exists on disk or is one of the accepted
    sentinels (None, "", or "!" meaning "pick randomly"). Otherwise logs a
    critical message and returns None (the original behavior).
    """
    # Check the sentinels BEFORE os.path.exists: the original evaluated
    # os.path.exists(arg) first, which raises TypeError when arg is None.
    if arg is None or arg == "" or arg == "!" or os.path.exists(arg):
        return arg
    logger.critical("The file %s does not exist!" % arg)


def is_valid_dir(parser, arg):
    """argparse-style validator: ensure `arg` is an existing, non-empty directory.

    Calls parser.error() (which raises SystemExit) when the directory is
    missing or empty; otherwise returns `arg` so this can be used as an
    argparse `type=` callback.
    """
    if not os.path.exists(arg):
        parser.error( "The directory %s does not exist\n" % (arg))
    if len(glob("%s/*"%(arg))) == 0:
        # Typo fix: message previously read "The director %s is empty".
        parser.error("The directory %s is empty"%(arg))
    return arg


def parse_args():
    """Build the parsnp argument parser and parse sys.argv.

    Returns:
        argparse.Namespace with all parsnp options (sequences, reference,
        recruitment, MUM search, LCB alignment, partitioning, extension,
        and misc flags).
    """
    parser = argparse.ArgumentParser(description="""
    Parsnp quick start for three example scenarios:
    1) With reference & genbank file:
    python Parsnp.py -g <reference_genbank_file1 reference_genbank_file2 ...> -d <seq_file1 seq_file2 ...>  -p <threads>

    2) With reference but without genbank file:
    python Parsnp.py -r <reference_genome> -d <seq_file1 seq_file2 ...> -p <threads>

    3) Autorecruit reference to a draft assembly:
    python Parsnp.py -q <draft_assembly> -d <seq_file1 seq_file2 ...> -p <threads>
    """, formatter_class=argparse.RawTextHelpFormatter)
    #TODO Use lambda to check files and directories
    input_output_args = parser.add_argument_group(title="Input/Output")
    input_output_args.add_argument(
        "-c",
        "--curated",
        action = "store_true",
        help = "(c)urated genome directory, use all genomes in dir and ignore MUMi?")
    input_output_args.add_argument(
        "-d",
        "--sequences",
        type = str,
        nargs = '+',
        required = True,
        help = "A list of files containing genomes/contigs/scaffolds. If the file ends in .txt, each line in the file corresponds to the path to an input file.")
    input_output_args.add_argument(
        "-r",
        "--reference",
        type = lambda fname: is_valid_file_path(parser, fname),
        default = "",
        help = "(r)eference genome (set to ! to pick random one from sequence dir)")
    #TODO Accept as space-separated input and parse automatically w/ argparse
    input_output_args.add_argument(
        "-g",
        "--genbank",
        nargs = '+',
        help = "A list of Genbank file(s) (gbk)")
    input_output_args.add_argument(
        "-o",
        "--output-dir",
        type = str,
        default = "[P_CURRDATE_CURRTIME]",
        help = "Output directory")
    input_output_args.add_argument(
        "-q",
        "--query",
        type = str,
        help = "Specify (assembled) query genome to use, in addition to genomes found in genome dir")

    MUMi_args = parser.add_argument_group(title="MUMi")
    MUMi_mutex_args = MUMi_args.add_mutually_exclusive_group()
    #TODO whats the default?
    MUMi_mutex_args.add_argument(
        "-U",
        "--max-mumi-distr-dist",
        "--MUMi",
        type = float,
        default = 0.5,
        help = "Max MUMi distance value for MUMi distribution")
    #TODO Not parsed in current parsnp version and had a duplicate -i flag. Is this no longer used?
    MUMi_mutex_args.add_argument(
        "-mmd",
        "--max-mumi-distance",
        type = float,
        help = "Max MUMi distance (default: autocutoff based on distribution of MUMi values)")
    MUMi_args.add_argument(
        "-F",
        "--fastmum",
        action = "store_true",
        help = "Fast MUMi calculation")
    MUMi_args.add_argument(
        "-M",
        "--mumi_only",
        "--onlymumi",
        action = "store_true",
        help = "Calculate MUMi and exit? overrides all other choices!")

    MUMi_rec_prog = MUMi_args.add_mutually_exclusive_group()
    # MUMi_rec_prog.add_argument(
        # "--use-mummer-mumi",
        # action = "store_true",
        # help = "Use mummer for MUMi distance genome recruitment")
    MUMi_rec_prog.add_argument(
        "--use-ani",
        action = "store_true",
        help = "Use ani for genome recruitment")
    MUMi_args.add_argument(
        "--min-ani",
        type = float,
        default = 95,
        help = "Min ANI value to allow for genome recruitment.")
    MUMi_args.add_argument(
        "--min-ref-cov",
        type = float,
        default = 0,
        help = "Minimum percent of reference segments to be covered in FastANI")
    MUMi_rec_prog.add_argument(
        "--use-mash",
        action = "store_true",
        help = "Use mash for genome recruitment")
    MUMi_args.add_argument(
        "--max-mash-dist",
        type = float,
        default = .1,
        help = "Max mash distance.")

    MUM_search_args = parser.add_argument_group(title="MUM search")
    #new, default to lower, 12-17
    MUM_search_args.add_argument(
        "-a",
        "--min-anchor-length",
        "--anchorlength",
        type = str,
        default = "1.1*(Log(S))",
        help = "Min (a)NCHOR length (default = 1.1*(Log(S)))")
    MUM_search_args.add_argument(
        "-m",
        "--mum-length",
        "--mumlength",
        type = str,
        default = "1.1*(Log(S))",
        help = "Mum length")
    MUM_search_args.add_argument(
        "-C",
        "--max-cluster-d",
        "--clusterD",
        type = int,
        default = 300,
        help = "Maximal cluster D value")
    MUM_search_args.add_argument(
        "-z",
        "--min-cluster-size",
        "--minclustersize",
        type = int,
        default = 21,
        help = "Minimum cluster size")
    #TODO -z was a duplicate flag but no longer parsed as min-lcb-size in the current parsnp version
    # MUM_search_args.add_argument(
            # "-z",
            # "--min-lcb-size",
            # type = int,
            # default = 25,
            # help = "Min LCB si(z)e")

    LCB_alignment_args = parser.add_argument_group(title="LCB alignment")
    LCB_alignment_args.add_argument(
        "-D",
        "--max-diagonal-difference",
        "--DiagonalDiff",
        metavar = "MAX_DIAG_DIFF",
        type = str,
        default="0.12",
        help = "Maximal diagonal difference. Either percentage (e.g. 0.2) or bp (e.g. 100bp)")
    LCB_alignment_args.add_argument(
        "-n",
        "--alignment-program",
        "--alignmentprog",
        type = str,
        choices = list(ALIGNER_TO_IDX.keys()),
        default = "muscle",
        help = "Alignment program to use")
    LCB_alignment_args.add_argument(
        "-u",
        "--unaligned",
        action = "store_true",
        help = "Output unaligned regions")

    # recombination_args = parser.add_argument_group("Recombination filtration")
    #TODO -x was a duplicate flag but no longer parsed as filter-phipack-snps in the current parsnp version
    # recombination_args.add_argument(
            # "-x",
            # "--filter-phipack-snps",
            # action = "store_true",
            # help = "Enable filtering of SNPs located in PhiPack identified regions of recombination")

    partition_args = parser.add_argument_group("Sequence Partitioning")
    partition_args.add_argument(
        "--no-partition",
        action='store_true',
        help="Run all query genomes in single parsnp alignment, no partitioning.")
    partition_args.add_argument(
        "--min-partition-size",
        type=int,
        default=50,
        help="Minimum size of a partition. Input genomes will be split evenly across partitions at least this large.")

    extend_args = parser.add_argument_group("LCB Extension")
    extend_args.add_argument(
        "--extend-lcbs",
        action="store_true",
        help="Extend the boundaries of LCBs with an ungapped alignment")
    extend_args.add_argument(
        "--extend-ani-cutoff",
        type=float,
        default=0.95,
        help="Cutoff ANI for lcb extension")
    extend_args.add_argument(
        "--extend-indel-cutoff",
        type=int,
        default=50,
        help="Cutoff for indels in LCB extension region. LCB extension will be at most min(seqs) + cutoff bases")
    extend_args.add_argument(
        "--match-score",
        type=float,
        default=5,
        help="Value of match score for extension")
    extend_args.add_argument(
        "--mismatch-penalty",
        type=float,
        default=-4,
        help="Value of mismatch score for extension (should be negative)")
    extend_args.add_argument(
        "--gap-penalty",
        type=float,
        default=-2,
        help="Value of gap penalty for extension (should be negative)")

    misc_args = parser.add_argument_group("Misc")
    misc_args.add_argument(
        "--skip-phylogeny",
        action="store_true",
        help="Do not generate phylogeny from core SNPs")
    misc_args.add_argument(
        "--validate-input",
        action="store_true",
        help="Use Biopython to validate input files")
    misc_args.add_argument(
        "--use-fasttree",
        action = "store_true",
        help = "Use fasttree instead of RaxML")
    misc_args.add_argument(
        "--vcf",
        action = "store_true",
        help = "Generate VCF file.")
    misc_args.add_argument(
        "--no-maf",
        action = "store_true",
        help = "Do not generate MAF file (XMFA only)")
    misc_args.add_argument(
        "-p",
        "--threads",
        type = int,
        default = 1,
        help = "Number of threads to use")
    misc_args.add_argument(
        "-P",
        "--max-partition-size",
        type = int,
        default = 15000000,
        help = "Max partition size (limits memory usage)")
    misc_args.add_argument(
        "-v",
        "--verbose",
        action = "store_true",
        help = "Verbose output")
    misc_args.add_argument(
        "-x",
        "--xtrafast",
        action = "store_true")
    misc_args.add_argument(
        "-i",
        "--inifile",
        "--ini-file",
        type = str)
    misc_args.add_argument(
        "-e",
        "--extend",
        action = "store_true")
    misc_args.add_argument(
        "--no-recruit",
        action = "store_true")
    misc_args.add_argument(
        "-V",
        "--version",
        action = "version",
        version = "%(prog)s " + __version__)

    todo_args = parser.add_argument_group("Miscellaneous")
    # todo_args.add_argument(
        # "-l",
        # "--layout",
        # action = "store_true")
    # todo_args.add_argument(
        # "-s",
        # "--split",
        # action = "store_true",
        # help = "Split genomes by n's")
    return parser.parse_args()
####################################################################################################
#print("-g = <bool>: auto-launch (g)ingr? (default = NO)"

def parse_genbank_files(cmd_args, output_dir):
    '''
    Single method to do the genbank file parsing from the main control flow.

    Each provided genbank file (or every file in a provided directory) is
    parsed for its GI identifier (on the "VERSION ... GI:..." line) and its
    ORIGIN sequence data, which are appended as FASTA to
    <output_dir>/tmp/<genbank_basename>.fna. 'genbank_file' remains a global
    because one spot later in the code still reads it.

    Args:
        cmd_args (namespace): command-line arguments from parse_args().
        output_dir (str): run output directory (must contain a 'tmp' subdir).

    Returns:
        tuple (genbank_ref, genbank_files): path of the last .fna written
        (the genbank-derived reference) and the list of genbank files used.
    '''
    global genbank_file
    genbank_ref = ""

    # 1) Expand directories and validate each provided path.
    genbank_files = []
    for genbank_f in cmd_args.genbank:
        if os.path.isdir(genbank_f):
            for f in os.listdir(genbank_f):
                f = os.path.join(genbank_f, f)
                if os.path.isfile(f):
                    genbank_files.append(f)
        elif os.path.isfile(genbank_f):
            genbank_files.append(genbank_f)
        else:
            logger.error("{} is not a valid file".format(genbank_f))

    # 2) If there aren't any files then exit the whole command.
    if len(genbank_files) == 0:
        logger.critical("No valid genbank files provided...")
        sys.exit(1)

    # 3) Extract GI id + ORIGIN sequence from each genbank file.
    for genbank_file in genbank_files:
        if len(genbank_file) <= 1:
            continue
        genbank_ref = os.path.join(output_dir, "tmp", os.path.basename(genbank_file) + ".fna")
        try:
            # `with` ensures both handles are closed on every exit path
            # (the original leaked them when bailing out via sys.exit).
            with open(genbank_file, 'r') as rf, open(genbank_ref, "a+") as genbank_ref_d:
                while True:
                    giline = rf.readline()
                    # BUGFIX: the original condition `"VERSION" and "GI" in giline`
                    # only tested for "GI"; require both tokens on the line.
                    if "VERSION" in giline and "GI" in giline:
                        break
                    if giline == "":  # readline() returns "" at EOF
                        logger.critical("Genbank file %s malformatted \n" % (genbank_file))
                        sys.exit(1)
                if len(giline) <= 2:
                    logger.critical("Genbank file %s malformatted \n" % (genbank_file))
                    sys.exit(1)
                genbank_ref_d.write(">gi|" + giline.split("GI:")[-1])

                # Sequence data starts after ORIGIN and ends at "//"; strip
                # the 9-column position prefix and embedded spaces.
                ntdata = False
                data = ""
                for line in rf:
                    if ntdata:
                        if "//" in line:
                            ntdata = False
                            break
                        data += line[9:].replace(" ", "")
                    if "ORIGIN" in line:
                        ntdata = True

                if len(data) < 10:
                    logger.critical("Genbank file %s contains no sequence data\n" % (genbank_file))
                    sys.exit(1)
                genbank_ref_d.write(data.upper())
        except IOError:
            logger.critical("Genbank file %s not found\n" % (genbank_file))
            sys.exit(1)

    return genbank_ref, genbank_files


def check_for_dependencies(use_phipack, use_fasttree):
    '''
    Check that the external tools parsnp needs are on PATH; exit(1) otherwise.

    That includes
    1) Profile (if use_phipack, i.e. -x/--xtrafast),
    2) RAxML (if not use_fasttree) else at least one FastTree executable,
    3) harvesttools (always).
    '''
    missing = False
    # harvesttools is always required; Profile only with PhiPack filtering.
    for exe, needed in (("harvesttools", True), ("Profile", use_phipack)):
        if needed and shutil.which(exe) is None:
            missing = True
            logger.critical("{} not in system path!".format(exe))

    if use_fasttree:
        # Any one of the FastTree spellings is sufficient.
        if not any(shutil.which(exe) is not None for exe in ("fasttree", "FastTree", "FastTreeMP")):
            # (dropped the original's no-op .format(exe) on this message)
            logger.critical("No fasttree executable found in system path!")
            missing = True
    else:
        if shutil.which("raxmlHPC-PTHREADS") is None:
            missing = True
            logger.critical("raxmlHPC-PTHREADS not in system path!")

    if missing:
        sys.exit(1)


def create_output_directory(output_dir):
    '''
    Create the run's output directory and its tmp/, log/ and config/ subdirs.

    Refuses to use the current working directory or the filesystem root.
    A pre-existing directory is reused after clearing its partition/,
    config/ and log/ subdirectories; the default placeholder name is
    replaced with a "P_<date>_<time>" timestamp directory under the cwd.

    Args:
        output_dir (str): output folder given on the command line.

    Returns:
        str: the (possibly timestamp-substituted) output directory path.
    '''
    if output_dir in (".", "./", "/"):
        logger.critical("Specified output dir is current working dir or root dir! will clobber any parsnp.* results")
        sys.exit(1)

    if os.path.exists(output_dir):
        logger.warning(f"Output directory {output_dir} exists, all results will be overwritten")
        # Clear stale per-run subdirectories from a previous invocation.
        for subdir in ("partition", "config", "log"):
            stale = os.path.join(output_dir, subdir)
            if os.path.exists(stale):
                shutil.rmtree(stale)
    elif output_dir == "[P_CURRDATE_CURRTIME]":
        # Default placeholder: substitute a timestamped directory name.
        now = datetime.datetime.now()
        stamp = "P_" + now.isoformat().replace("-", "_").replace(".", "").replace(":", "").replace("T", "_")
        output_dir = os.getcwd() + os.sep + stamp

    for subdir in ("", "tmp", "log", "config"):
        os.makedirs(os.path.join(output_dir, subdir), exist_ok=True)
    return output_dir


def process_input_files(cmd_args, input_files):
    '''
    Takes the string for the input files given at the command-line (i.e. the 'sequences' or '-d' argument)
    and checks it for the various options to be turned into a list of actual fasta files representing each
    of the input alignments.

    There are really three ways to provide the list of sequence files at the command line. The first is
    to just give the list of files as a space-delimited list following the '-d' flag, although this can be
    a real pain if the list is more than just a couple. The second is to put all the files in a single folder
    and just provide the path to the folder after the '-d' flag. The third is to provide a '.txt' file
    containing a line-separated list of file paths for the sequences.

    So this function takes the value of args.sequences (i.e., the argument 'input_files') and does some checks
    on it, keeping a list of final input file paths to use.
        1) If it is a valid directory, add every folder in it to the final file list.
        2) Otherwise, if it is a valid file and the filename ends in '.txt', add each line of the file
            to the final file list.
        3) Otherwise, just add 'input_files' itself to the final file list, assuming that if the user provides
            a space-separated list at the command line then that will be in the correct 'list' form to pass
            along without converting.

    Exits with an error if fewer than two usable input files remain.

    Args:
        cmd_args: parsed command-line namespace (validate_input and
            extend_lcbs are read here).
        input_files: raw value of the -d/--sequences argument.

    Returns:
        list of str: the resolved (and optionally validated) input paths.
    '''
    # global input_files
    input_files_processed = []
    for input_f in input_files:
        # if it is a folder, add all the files in it to the list...
        if os.path.isdir(input_f):
            for f in os.listdir(input_f):
                f = os.path.join(input_f, f)
                if os.path.isfile(f):
                    input_files_processed.append(f)
        # ...otherwise if it's a text file then do a bunch of stuff to it, if not just add it to the list.
        elif os.path.isfile(input_f):
            # We now have the option to pass a text file which line-separated list of files (or directories):
            if Path(input_f).suffix == ".txt":
                with open(input_f) as seq_list:
                    for line in seq_list:
                        input_f = line.strip()
                        # Each listed entry may itself be a directory of files.
                        if os.path.isdir(input_f):
                            for f in os.listdir(input_f):
                                f = os.path.join(input_f, f)
                                if os.path.isfile(f):
                                    input_files_processed.append(f)
                        else:
                            input_files_processed.append(input_f)
            else:
                input_files_processed.append(input_f)
        else:
            logger.error("{} is not a valid file".format(input_f))
    input_files = input_files_processed
    if len(input_files) < 2:
        logger.critical("Less than 2 input sequences provided...")
        sys.exit(1)
    # ...Step to validate the input if that argument is specified:
    if cmd_args.validate_input:
        bad_files = set()
        # NOTE(review): `ref` is not defined in this function — it appears to
        # rely on a module-level global assigned in the main flow; confirm it
        # is always set before this branch runs.
        for f in input_files + ([ref] if ref and ref != "!" else []):
            try:
                records = list(SeqIO.parse(f, "fasta"))
            except:
                logger.error("{} is an invalid sequence file!".format(f))
                bad_files.add(f)
                continue
            if cmd_args.extend_lcbs and len(records) > 1:
                logger.error(f"Extending LCBs does not currently work with multi-contig inputs yet, {f} has multiple contigs.")
                sys.exit(1)
            for record in records:
                # IUPAC nucleotide codes (either case) are accepted.
                valid_chars = "GATCRYWSMKHBVDN"
                if any(c not in valid_chars + valid_chars.lower() for c in record.seq):
                    logger.error("Genome sequence {} has invalid characters {}! Skip!".format(f, set(str(
                        record.seq)) - set("AGCTNagctn")))
                    bad_files.add(f)
        input_files = list(set(input_files) - bad_files)

    return input_files

def parse_reference_file(ref):
    '''
    If a specific reference file is given (i.e. **NOT** a genbank reference file, one that is passed in with
    '-r' rather than '-g'), goes through and parses it.

    For a multi-FASTA reference, records the cumulative end position of each
    contig into the module-level ``ref_seqs`` mapping (cumulative position ->
    contig index).

    Args:
        ref:    reference path (assigned directly from args in the main loop);
                may be "" or the "!" sentinel, in which case nothing is parsed.

    Returns:
        tuple (multifasta, ref_seqs): whether the reference has more than one
        record, and the (global) position -> contig-index mapping.

    NOTE(review): ``ref_seqs`` is never initialized in this function — it is
    read and returned as a module-level global; confirm it is defined (e.g.
    as an empty dict) before this is called.
    '''
    # global multifasta, seq, e
    multifasta = False
    if ref and ref != "!":
        try:
            rf = open(ref, 'r')
            rfd = rf.read()
            # Split on '>' headers; drop the empty leading chunk.
            refseqs = rfd.split(">")[1:]
            currpos = 0
            if len(refseqs) > 1:
                multifasta = True
                for seqnum, seq in enumerate(refseqs):
                    # Drop the header line, then count sequence characters only.
                    seq = seq.split('\n', 1)[1]
                    fastalen = len(seq) - seq.count('\n')
                    ref_seqs[currpos + fastalen] = seqnum
                    currpos += fastalen
            rf.close()
        except IOError as e:
            logger.critical(" Reference genome file %s not found\n" % (ref))
            sys.exit(1)

    return multifasta, ref_seqs


def make_genome_and_reference_output_strings(ref, genbank_files):
    '''
    Assembles the strings for the console output saying which genome input files and which
    reference file are being used for this run.

    Long file lists are abbreviated to the first and last two entries.

    Args:
        ref:            (assigned directly from args in the main loop)
        genbank_files:  (assigned directly from args in the main loop)

    Returns:
        tuple (sortem, ref_string, genome_string, ref): whether genomes should
        be sorted (False when the reference came from genbank), the display
        strings, and the possibly-updated reference path.

    NOTE(review): reads the module-level globals ``input_files`` and
    ``genbank_ref`` rather than taking them as parameters — confirm both are
    set by the main flow before this is called.
    '''
    # global sortem, ref_string, genome_string, ref
    sortem = True
    ref_string = ref
    genome_string = ""
    if len(input_files) > 1:
        genome_string = "\n\t"
        if len(input_files) > 4:
            # Abbreviate: first two, a "...N more...", then the last two.
            genome_string += "\n\t".join(input_files[:2])
            genome_string += "\n\t...{} more file(s)...\n\t".format(len(input_files) - 4)
            genome_string += "\n\t".join(input_files[-2:])
        else:
            genome_string += "\n\t".join(input_files)
    else:
        genome_string = input_files[0]
    if len(ref) == 0 and len(genbank_ref) != 0:
        # we are parsing from genbank, set ref to genbank_ref && turn off sorting
        ref = genbank_ref
        if len(genbank_files) > 1:
            ref_string = "\n\t"
            if len(genbank_files) > 4:
                ref_string += "\n\t".join(genbank_files[:2])
                ref_string += "\n\t...{} more file(s)...\n\t".format(len(genbank_files) - 4)
                ref_string += "\n\t".join(genbank_files[-2:])
            else:
                ref_string = "\n\t".join(genbank_files)
        else:
            # NOTE(review): `+=` appends to the (empty) ref string here; the
            # sibling branches assign — confirm `=` was not intended.
            ref_string += genbank_files[0]

        sortem = False

    return sortem, ref_string, genome_string, ref


def readfa(fp):
    """
    Fasta parser taken from readfq.

    Yields (name, sequence) pairs from the file-like object *fp*, where name is
    the first whitespace-delimited token after '>' and sequence is the
    concatenation of all following lines up to the next header.
    """
    pending = None  # buffer holding the last header line we read but haven't consumed
    while True:
        if not pending:
            # Scan forward to the start of the next record.
            for line in fp:
                if line[0] == '>':  # fasta header line
                    pending = line[:-1]  # stash it (newline stripped)
                    break
        if not pending:
            break  # hit EOF without finding another record
        name = pending[1:].partition(" ")[0]
        pending = None
        chunks = []
        for line in fp:
            if line[0] in '>':
                pending = line[:-1]  # next record's header; stop this one
                break
            chunks.append(line[:-1])

        yield name, ''.join(chunks)  # emit one fasta record
        if not pending:
            break

def check_ref_genome_aligned(ref):
    """
    Sum the sequence lengths of the reference fasta at path *ref*, warning
    about any record containing '-' (i.e. one that looks pre-aligned).

    Returns the total reference length in bases.
    """
    total_len = 0
    with open(ref, 'r') as handle:
        for hdr, seq in readfa(handle):
            if '-' in seq:
                logger.warning(f"Reference genome sequence {hdr} in {ref} has '-' in the sequence!")
            total_len += len(seq)
    return total_len


def parse_input_files(input_files, curated, validate_input):
    '''
    Goes through and actually parses all the input files as fasta's and makes sure they don't raise any
    errors that way. Also calculates the total genome length and uses that to determine if any of them
    should be outright ignored because the length is too much different from the reference.

    NOTE(review): reads the module-level globals ``reflen`` (reference length) and
    ``args`` (for ``args.no_recruit``) set in the main block before this is called.

    Args:
        input_files (list of str):  list of file paths (pruned in place when validation fails)
        curated (bool):             if true, includes everything regardless of length (i.e., the input is
                                    "curated" already).
        validate_input (bool):      if true, pre-validate each file with Biopython before the
                                    length checks.

    Returns:
        Tuple of (input_files, fnafiles, fnaf_sizes): the (possibly pruned) input list,
        the files that passed all checks, and a dict mapping each kept file to its
        total sequence length.
    '''
    fnafiles = []
    # BUGFIX: fnaf_sizes was never initialized here and silently relied on a
    # module-level global of the same name; a local dict is behavior-equivalent
    # since the caller rebinds from the return value.
    fnaf_sizes = {}
    for input_file in input_files[:]:
        # If biopython cannot parse the input as a fasta, kick it out of the list and move on:
        if validate_input:
            try:
                record = list(SeqIO.parse(input_file, "fasta"))
                if len(record) == 0:
                    input_files.remove(input_file)
                    logger.error(f"{input_file} is an empty file!")
                    continue
            except Exception:  # was a bare 'except:', which also swallowed KeyboardInterrupt/SystemExit
                input_files.remove(input_file)
                logger.error(f"Could not parse {input_file}!")
                continue

        # Old version of the parser:
        with open(input_file, 'r') as ff:
            concat_seq = ""
            for hdr, seq in readfa(ff):
                concat_seq += seq

            seqlen = len(concat_seq)
            if '-' in concat_seq:
                logger.error("Genome sequence %s seems to be aligned! Skip!" % ((input_file)))
                continue
            elif seqlen <= 20:
                logger.error("File %s is less than or equal to 20bp in length. Skip!" % (input_file))
                continue
            # Ratio of reference length to this genome's length; <=0.8 means the
            # genome is >1.25x longer than the reference, >=1.2 means much shorter.
            sizediff = float(reflen) / seqlen

            # Argument for ignoring any issues with the input/references:
            if curated:
                log_f = logger.warning
                msg = ""
            else:
                log_f = logger.error
                msg = "Skipping..."
            if sizediff <= 0.8:
                log_f("File {} is {:.2f}x longer than reference! {}".format(
                    input_file, 1 / sizediff, msg))
                if not curated or args.no_recruit:
                    continue
            elif sizediff >= 1.2:
                log_f("File {} is {:.2f}x shorter than reference genome! {}".format(
                    input_file, sizediff, msg))
                if not curated or args.no_recruit:
                    continue
            fnafiles.append(input_file)
            fnaf_sizes[input_file] = seqlen

    return input_files, fnafiles, fnaf_sizes


def write_inifile_1(args, ref, aligner, xtrafast, output_dir, fastmum, reflen, fnafiles, threads):
    '''
    Writes the first .ini file. Starts with 'template.ini' in the parsnp folder and replaces various
    values in it, then writes the output to 'all_mumi.ini' in the output folder (which is eventually
    deleted at the end of the program).

    Most of these args are pulled directly from the command-line arguments, although a copy of that
    is passed in directly.

    Args:
        args (namespace):   should be a copy of the command-line 'args' variable from parse-args.
        ref:                (assigned directly from args in the main loop)
        aligner:            (assigned directly from args in the main loop)
        xtrafast:           (assigned directly from args in the main loop)
        output_dir:         (assigned directly from args in the main loop)
        fastmum:            (assigned directly from args in the main loop)
        reflen:             (assigned directly from args in the main loop)
        fnafiles:           (assigned directly from args in the main loop)
        threads:            (assigned directly from args in the main loop)

    Returns:
        The filled-in template text (before the mumi-specific substitutions), for
        reuse by write_inifile_2.
    '''
    logger.debug("Writing .ini file")
    # NOTE(review): 'or 1' makes this unconditional, so extension is always
    # disabled; preserved as-is since downstream behavior depends on it.
    if xtrafast or 1:
        args.extend = False
    # Use context managers so the template and output handles are always closed
    # (the originals leaked open file objects).
    with open("%s/template.ini" % (PARSNP_DIR), 'r') as template_f:
        inifiled = template_f.read()
    inifiled = inifiled.replace("$REF", ref)
    inifiled = inifiled.replace("$EXTEND", "%d" % (args.extend))
    inifiled = inifiled.replace("$ANCHORS", str(args.min_anchor_length))
    inifiled = inifiled.replace("$MUMS", str(args.mum_length))
    inifiled = inifiled.replace("$MINCLUSTER", str(args.min_cluster_size))
    inifiled = inifiled.replace("$CLUSTERD", str(args.max_cluster_d))
    inifiled = inifiled.replace("$THREADS", str(threads))
    inifiled = inifiled.replace("$ALIGNER", str(aligner))
    inifiled = inifiled.replace("$DIAGDIFF", str(args.max_diagonal_difference))
    inifiled = inifiled.replace("$RECOMBFILT", "%d" % (xtrafast))
    inifiled = inifiled.replace("$OUTDIR", output_dir)
    if fastmum:
        # In fastmum mode, cap the partition position at 20% of the reference length.
        inifiled = inifiled.replace("$PARTPOS", "%d" % (0.2 * reflen))
    else:
        inifiled = inifiled.replace("$PARTPOS", "%s" % (args.max_partition_size))
    file_string = ""
    for cnt, fna_file in enumerate(fnafiles, 1):
        file_string += "file%d=%s\n" % (cnt, fna_file)
        file_string += "reverse%d=0\n" % (cnt)
    inifiled_mumi = inifiled.replace("$FILES\n", file_string)
    inifiled_mumi = inifiled_mumi.replace("calcmumi=0", "calcmumi=1")
    with open(os.path.join(output_dir, "config", "all_mumi.ini"), 'w') as inifile_mumi:
        inifile_mumi.write(inifiled_mumi)

    return inifiled


def write_inifile_2(inifiled, outputDir, unaligned, args, auto_ref, query, finalfiles, ref, threads):
    '''
    Writes the second .ini file ('parsnpAligner.ini' and 'psnn.ini') using part of the previous one as a
    starting point.

    NOTE(review): reads the module-level globals ``fastmum``, ``reflen``,
    ``autopick_ref`` and ``orig_auto_ref`` set in the main block.

    Args:
        inifiled:           Copy of the previous .ini file text string (output by write_inifile_1(...))
        outputDir:          (assigned directly from args in the main loop)
        unaligned:          (assigned directly from args in the main loop)
        args (namespace):   should be a copy of the command-line 'args' variable from parse-args.
        auto_ref:           (assigned directly from args in the main loop)
        query:              (assigned directly from args in the main loop)
        finalfiles:         copy of the final file set to align from other code in the main loop.
        ref:                (assigned directly from args in the main loop)
        threads:            Number of threads to use

    Returns:
        None. Writes {outputDir}/parsnpAligner.ini as a side effect.
    '''
    # Rewrite output dir
    comp = re.compile(r"(.*)\noutdir=(.*)\n(.*)")
    inifiled = comp.sub(fr"\1\noutdir={outputDir}\n\3", inifiled)

    # Replace threads
    comp = re.compile(r"(.*)\ncores=(.*)\n(.*)")
    inifiled = comp.sub(fr"\1\ncores={threads}\n\3", inifiled)

    if len(finalfiles) < 1 or ref == "":
        logger.critical("Parsnp requires 2 or more genomes to run, exiting\n")
        sys.exit(1)
    file_string = ""
    cnt = 1
    file_string_closest = ""
    # The "closest" variant lists only the first genome.
    for cnt, f in enumerate(finalfiles[0:1], 1):
        file_string_closest += "file%d=%s\n" % (cnt, f)
        file_string_closest += "reverse%d=0\n" % (cnt)
    for cnt, f in enumerate(finalfiles, 1):
        file_string += "file%d=%s\n" % (cnt, f)
        file_string += "reverse%d=0\n" % (cnt)
    inifiled = inifiled.replace("$FILES\n", file_string)
    # new, output unaligned regions
    inifiled = inifiled.replace("$UNALIGNED", unaligned)
    inifiled_closest = inifiled.replace("$FILES\n", file_string_closest)
    if fastmum:
        inifiled = inifiled.replace("p=%d" % (0.2 * reflen), "p=%s" % (args.max_partition_size))
        # BUGFIX: previously rebuilt inifiled_closest from 'inifiled', which
        # discarded the closest-file substitution performed above.
        inifiled_closest = inifiled_closest.replace("p=%d" % (0.2 * reflen), "p=%s" % (args.max_partition_size))
    if autopick_ref:
        # Swap the auto-picked reference and the query via a temporary name so
        # neither replacement clobbers the other.
        inifiled = inifiled.replace(orig_auto_ref, auto_ref)
        inifiled = inifiled.replace(auto_ref, "tmp_" + auto_ref)
        inifiled = inifiled.replace(query, auto_ref)
        inifiled = inifiled.replace("tmp_" + auto_ref, query)
        inifiled_closest = inifiled_closest.replace(auto_ref, "tmp_" + auto_ref)
        inifiled_closest = inifiled_closest.replace(query, auto_ref)
        inifiled_closest = inifiled_closest.replace("tmp_" + auto_ref, query)
    # Context manager ensures the handle is closed (original leaked it).
    with open(f"{outputDir}/parsnpAligner.ini", 'w') as inifile:
        inifile.write(inifiled)
    # inifile_closest = open(f"{outputDir}/psnn.ini", 'w')
    # inifile_closest.write(inifiled_closest)
    # inifile_closest.close()


# Assumes outputDir/parsnpAligner.ini exists
def run_parsnp_aligner(outputDir):
    """
    Run the parsnp_core aligner with the .ini file already written into
    *outputDir*, then rename its xmfa output to parsnp.xmfa on success.
    Returns the aligner's return code.
    """
    ini_path = f"{outputDir}/parsnpAligner.ini"
    if not os.path.exists(ini_path):
        logger.error("ini file %s does not exist!\n"%(ini_path))
        sys.exit(1)
    cmd = "%s/bin/parsnp_core %s"%(PARSNP_DIR,ini_path)
    rc = run_logged_command(command=cmd, ignorerc=1, label="parsnp-aligner", outputDir=outputDir)

    xmfa_path = os.path.join(outputDir, "parsnpAligner.xmfa")
    if not os.path.exists(xmfa_path) or rc != 0:
        logger.critical("Set of recruited genomes are too divergent for parsnp, please reduce MUMi (%f) and relaunch\n"%(float(mumidistance)))
    else:
        shutil.move(xmfa_path, os.path.join(outputDir, "parsnp.xmfa"))
    return rc


#   Most of the control flow for the whole Parnsp program runs through this script, which is probably not an
#   awesome design pattern.
if __name__ == "__main__":
    # 1) Initialize some variables and parse command-line args:
    t1 = time.time()
    logger.info(f"|--Parsnp {VERSION}--|\n")


    parsnp_dir= sys.path[0]
    #print parsnp_dir
    #PARSNP_DIR = parsnp_dir
    opts = []
    args = []

    OSTYPE, BINARY_TYPE = get_os()
    args = parse_args()
    currdir = os.getcwd()
    logging_level = logging.DEBUG if args.verbose else logging.INFO
    ref = args.reference
    randomly_selected_ref = False
    if ref == '!':
        randomly_selected_ref = True
    input_files = args.sequences
    query = args.query
    # anchor = args.min_anchor_length   # (using args directly)
    #TODO I'm guessing mummer_mumi was intended to be an option?
    # use_mummer_mumi = args.use_mummer_mumi
    use_mummer_mumi = False
    use_ani = args.use_ani
    use_mash = args.use_mash
    use_fasttree = args.use_fasttree
    use_parsnp_mumi = not (use_mash or use_mummer_mumi or use_ani)
    # mum = args.mum_length # (using args directly)
    # maxpartition = args.max_partition_size
    fastmum = args.fastmum
    # cluster = args.max_cluster_d # (using args directly)
    curated = args.curated
    aligner = ALIGNER_TO_IDX[args.alignment_program.lower()]
    threads = args.threads
    unaligned = "0" if not args.unaligned else "1"
    # mincluster = args.min_cluster_size # (using args directly)
    diagdiff = args.max_diagonal_difference
    # splitseq = args.split
    # extend = args.extend  # (using args directly)
    # layout = args.layout
    xtrafast = args.xtrafast
    inifile = args.inifile
    inifile_exists = args.inifile is not None
    mumi_only = args.mumi_only
    mumidistance = args.max_mumi_distr_dist
    max_mash_dist = args.max_mash_dist
    min_ani_cutoff = args.min_ani
    outputDir = args.output_dir
    genbank_file = ""
    genbank_files = []
    genbank_files_cat = ""
    genbank_ref = ""
    reflen = 0
    use_gingr = ""
    generate_vcf = args.vcf
    filtreps = False

    repfile = ""
    ref_seqs = {}

    logger.setLevel(logging_level)
    for handler in logger.handlers:
        handler.setLevel(logging_level)

    # 2) Check for dependencies
    check_for_dependencies(xtrafast, use_fasttree)

    # 3) Create output dir
    outputDir = create_output_directory(outputDir)

    # 4) Process all input files:
    #   -- 'input_files' here is args.sequences
    input_files = process_input_files(args, input_files)

    # Parse reference if necessary
    multifasta, ref_seqs = parse_reference_file(ref)

    # 5) Validate genbank files
    #TODO Make this a function
    # return genbank_ref
    if args.genbank:
        genbank_ref, genbank_files = parse_genbank_files(args, outputDir)

    # This part just makes the output string for the command output listing all the genomes:
    sortem, ref_string, genome_string, ref = make_genome_and_reference_output_strings(ref, genbank_files)

    # Selects the reference:
    autopick_ref = False
    if (not ref and not query) or not input_files:
        logger.critical("No seqs provided, yet required. exit!")
        sys.exit(0)  # TODO Should this exit value be 0?
    elif not ref and query:
        logger.warning("No reference genome specified, going to autopick from input as closest to %s\n"%(query))
        autopick_ref = True
        ref = query

    logger.info("""
{}
SETTINGS:
|-refgenome:\t{}
|-genomes:\t{}
|-aligner:\t{}
|-outdir:\t{}
|-OS:\t{}
|-threads:\t{}
{}
    """.format(
        (len(outputDir)+17)*"*",
        "autopick" if ref == '!' else ref_string,
        genome_string,
        args.alignment_program,
        outputDir,
        OSTYPE,
        threads,
        (len(outputDir)+17)*"*"))

    # If we requested more threads than available, give a warning.
    try:
        if len(os.sched_getaffinity(0)) < threads:
            logger.warning("You have asked to use more threads than you have available on your machine. This may lead to serious performance degredation with RAxML.")
    except AttributeError:
        if multiprocessing.cpu_count() < threads:
            logger.warning("You have asked to use more threads than you have available on your machine. This may lead to serious performance degredation with RAxML.")

    logger.info("<<Parsnp started>>")

    #1)read fasta files (contigs/scaffolds/finished/DBs/dirs)
    # logger.info("Reading Genbank file(s) for reference (.gbk) %s"%("\t".join(genbank_files)))
    if len(genbank_file) == 0:
        logger.info("No genbank file provided for reference annotations, skipping..")

    allfiles = []
    fnaf_sizes = {}
    reflen = 0

    if ref == "!":
        ref = random_seeded.choice(input_files)

    # Check if reference genome is aligned
    reflen = check_ref_genome_aligned(ref)

    # Uses biopython to parse the inputs (just checking it again)
    input_files, fnafiles, fnaf_sizes = parse_input_files(input_files, curated, args.validate_input)

    # if ref in fnafiles:
        # fnafiles.remove(ref)

    #sort reference by largest replicon to smallest
    if sortem and os.path.exists(ref) and not autopick_ref:
        with open(ref, 'r') as ff:
            seq_dict = {hdr: seq for hdr, seq in readfa(ff)}
        seqs_sorted_by_len = sorted(seq_dict.items(), key=lambda kv: -len(kv[1]))
        new_ref = os.path.join(outputDir, os.path.basename(ref)+".ref")
        with open(new_ref, 'w') as ffo:
            for hdr, seq in seqs_sorted_by_len:
                ffo.write(f">{hdr}\n")
                ffo.write(f"{seq}\n")
        ref = new_ref
    else:
        ref = genbank_ref

    # TODO stray comment: remove any query sequences 30% diff in length
    allfiles = [os.path.basename(ref)]
    #write INI file to pass to the first stage of parsnp:
    if not inifile_exists:
        inifiled = write_inifile_1(args, ref, aligner, xtrafast, outputDir, fastmum, reflen, fnafiles, threads)

    #2)get near neighbors (mumi distance)
    if os.path.exists(os.path.join(outputDir, "alltogether.fasta")):
        os.remove(os.path.join(outputDir, "alltogether.fasta"))
    if os.path.exists(os.path.join(outputDir, "blocks/b1")):
        ftrm = glob(os.path.join(outputDir, "blocks/b*"))
        for f in ftrm:
            shutil.rmtree(f)

    if len(fnafiles) < 1 or ref == "":
        logger.critical("Parsnp requires 2 or more genomes to run, exiting")
        logger.debug("Only files found are: {}\n{} ".format(fnafiles, ref))
        sys.exit(1)

    # This is where it will calculate the genome distance (mumi or ANI) to the reference and will kick out
    #   ones that are too far apart from the final output.
    mumi_dict = {}
    finalfiles = []
    auto_ref = ""
    if not curated:
        logger.info("Recruiting genomes...")
        if use_parsnp_mumi:
            if not inifile_exists:
                command = f"{PARSNP_DIR}/bin/parsnp_core {outputDir}/config/all_mumi.ini"
            else:
                # TODO why are we editing the suffix of a provided file?
                command = f"{PARSNP_DIR}/bin/parsnp_core {inifile.replace('.ini', '_mumi.ini')}"
            run_logged_command(command=command, outputDir=outputDir, label="parsnp-mumi")
            # Takes eeach sequence and computes its mumi distance to the reference
            try:
                mumif = open(os.path.join(outputDir,  "all.mumi"),'r')
                for line in mumif:
                    line = line.rstrip('\n')
                    idx, mi = line.split(":")
                    mumi_dict[int(idx)-1] = float(mi)
                shutil.move(f"{outputDir}/all.mumi", f"{outputDir}/config/all.mumi")
            except IOError as e:
                logger.error("MUMi file generation failed... use all?")
                for i, _ in enumerate(fnafiles):
                    mumi_dict[i] = 1
            lowest_mumi = 100

            if autopick_ref:
                for idx in list(mumi_dict.keys()):
                    #TODO is there a way to organize these via dict rather than list? Seems error prone
                    if mumi_dict[idx] < lowest_mumi:
                        auto_ref = fnafiles[idx]
                        ref = auto_ref
                        lowest_mumi = mumi_dict[idx]
            mumi_f = ""
            if mumi_only:
                mumi_f = open(os.path.join(outputDir, "config", "recruited_genomes.lst"),'w')


            sorted_x = sorted(iter(mumi_dict.items()), key=operator.itemgetter(1))
            mumivals = []
            for scnt, item in enumerate(sorted_x):
                if scnt > 100 or scnt >= len(sorted_x):
                    break
                if float(item[1]) < float(mumidistance):
                    mumivals.append(float(item[1]))
            minv = minv = numpy.percentile(mumivals, 0) if len(mumivals) > 0 else 1.0
            dvals = mumivals

            stdv = 0
            hpv = 0
            if len(dvals) > 0:
                stdv = numpy.std(dvals)
                hpv = minv + (3*stdv)

            for idx in mumi_dict.keys():
                if mumi_dict[idx] < (float(mumidistance)) or curated:
                    if fastmum and mumi_dict[idx] > hpv:
                        continue
                    #TODO if 1, why is this?
                    if 1 or auto_ref != fnafiles[idx]:
                        if mumi_only:
                            mumi_f.write(os.path.abspath(fnafiles[idx])+",%f"%(mumi_dict[idx])+"\n")
                        finalfiles.append(fnafiles[idx])
                        allfiles.append(fnafiles[idx])
            if mumi_only:
                mumi_f.close()
                exit(0)

        else:
            try:
                tmp_dir = f"{outputDir}/tmp"
                all_genomes_fname = os.path.join(tmp_dir, "genomes.lst")
                with open(all_genomes_fname, 'w') as all_genomes_f:
                    all_genomes_f.writelines((line + '\n' for line in fnafiles))
                if use_mash:
                    if randomly_selected_ref:
                        logger.warning("You are using a randomly selected genome to recruit genomes from your input with Mash. If input genomes vary greatly in size, results could be suboptimal. It is advised to select a reference genome for recruitment...")
                    command = " ".join([
                            "mash", "dist", "-t",
                            "-d", str(max_mash_dist),
                            "-p", str(threads),
                            ref,
                            "-l", all_genomes_fname])
                    run_logged_command(command, outputDir=outputDir, label="mash")
                    with open(f"{outputDir}/log/mash.out", 'r') as mash_stdout:
                        mash_out = list(mash_stdout.readlines())
                        finalfiles = [line.split('\t')[0] for line in mash_out[1:] if line != '' and len(line.split('\t')) > 1 and line.split('\t')[1].strip() != '']
                elif use_ani:
                    # Guarantee at least 100 segments in FastANI (unless it requires a fragment length < 500)
                    fraglen = min(
                        3000, 
                        max(500, reflen // 100))
                    if randomly_selected_ref:
                        logger.warning("You have not selected a reference and are using ANI recruitment. All-to-all FastANI will be performed in order to obtain the best reference. As this is O(N^2), it is advised to select a reference genome if you have many query sequences.")
                    command = " ".join([
                            "fastANI",
                            ("--rl" if randomly_selected_ref else "-r"), 
                            (all_genomes_fname if randomly_selected_ref else ref),
                            "--ql", all_genomes_fname,
                            "--fragLen", str(fraglen),
                            "-t", str(threads),
                            "-o", os.path.join(outputDir, "fastANI.tsv")])
                    run_logged_command(command=command, outputDir=outputDir, label="fastANI")
                    genome_to_genomes = defaultdict(list)
                    with open(os.path.join(outputDir, "fastANI.tsv")) as results:
                        for line in results:
                            # FastANI results file -> Query, Ref, ANI val, extra stuff,,,
                            line = line.split('\t')
                            if float(line[2]) >= min_ani_cutoff and (int(line[3]) / int(line[4])) * 100 >= args.min_ref_cov:
                                genome_to_genomes[line[1]].append(line[0])

                        ani_ref = max(genome_to_genomes, key=(lambda key: len(genome_to_genomes[key])))
                        if autopick_ref:
                            auto_ref = ani_ref
                        finalfiles = [f for f in fnafiles if f in genome_to_genomes[ani_ref]]

                # shutil.rmtree(tmp_dir)
            except subprocess.CalledProcessError as e:
                logger.critical(
                    "Recruitment failed with exception {}. More details may be found in the *.err output log".format(str(e)))
                # shutil.rmtree(tmp_dir)
            allfiles.extend(finalfiles)

    # If "curateD" then anything that was removed gets added back.
    if curated:
        for f in fnafiles:
            if f not in finalfiles:
                finalfiles.append(f)
            if f not in allfiles:
                allfiles.append(f)

    # More stuff to autopick the reference if needed:
    orig_auto_ref = auto_ref
    if os.path.exists(auto_ref) and autopick_ref:
        seq_dict = {}
        seq_len = {}
        with open(auto_ref, 'r') as ff:
            seq_dict = {hdr: seq for hdr, seq in readfa(ff)}

        seqs_sorted_by_len = sorted(seq_dict.items(), key=lambda kv: -len(kv[1]))
        auto_ref = os.path.join(outputDir, os.path.basename(auto_ref)+".ref")
        with open(ref, 'w') as ffo:
            for hdr, seq in seqs_sorted_by_len:
                ffo.write(f">{hdr}\n")
                ffo.write(f"{seq}\n")
        ref = auto_ref

    finalfiles = sorted(finalfiles)
    random_seeded.shuffle(finalfiles)
    totseqs = len(finalfiles) + 1 # Add one to include reference

    #initiate parallelPhiPack tasks
    run_recomb_filter = 0

    #3)run parsnp (cores, grid?)
    if args.no_partition or len(finalfiles) < 2*args.min_partition_size:
        if len(finalfiles) < 2*args.min_partition_size:
            logger.info(f"Too few genomes to run partitions of size >{args.min_partition_size}. Running all genomes at once.")
        # Editing the ini file to be used for parnsp-aligner (which is different from parsnip as a mumi finder)
        if not inifile_exists:
            write_inifile_2(inifiled, outputDir, unaligned, args, auto_ref, query, finalfiles, ref, args.threads)

        logger.info("Running Parsnp multi-MUM search and libMUSCLE aligner...")
        run_parsnp_aligner(outputDir)
        shutil.move(f"{outputDir}/parsnpAligner.log", f"{outputDir}/log/parsnpAligner.log")

        blocks_dir = os.path.join(outputDir, "blocks")
        if not os.path.exists(blocks_dir):
            os.mkdir(blocks_dir)

        #get coverage
        coverage = 0
        totlength = 0
        try:
            cf = open(os.path.join(outputDir, "log", "parsnpAligner.log"))
            for line in cf:
                if "Total coverage among all sequences:" in line:
                    coverage = line.split(":",1)[-1].replace("\n","")
                    coverage = float(coverage.replace("%",""))/100.0
                elif "Length:" in line:
                    totlength += int(line.split(":",1)[-1].replace("\n","").split("bps")[0])
        except IOError:
            logger.critical("ParsnpAligner.log missing, parsnpAligner failed.")
            sys.exit(1)

        #update thresholds
        if coverage < 0.1:
            if coverage <= 0.01:
                logger.critical("""Aligned regions cover less than 1% of reference genome, something is not right
    Adjust params and rerun. If issue persists please submit a GitHub issue""")
                sys.exit(1)
            else:
                logger.warning("""Aligned regions cover less than 10% of reference genome!
    Please verify recruited genomes are all strain of interest""")
        else:
            pass
        #print("-->Getting list of LCBs.."
        allbfiles = glob(os.path.join(blocks_dir, "b*/*"))
        blockfiles = []
        block_startpos = []
        block_dict = {}
        for f in allbfiles:
            if os.path.isfile(f):
                if "seq.fna" in f:
                    blockfiles.append(f)
                    lf = open(f, 'r')
                    header = lf.readline()
                    if header[0] != ">":
                        logger.error("Error with LCB: %s\n"%(f))
                        continue

                    inf = header.split("+",1)[0]

                    rseq = ""
                    while 1:
                        lff = lf.readline()
                        if lff[0] == ">":
                            break
                        rseq += lff.replace("\n","")

                    spos,epos = inf.split(":",1)[-1].split("-",1)
                    block_startpos.append(int(spos))
                    block_dict[f] = [int(spos),int(epos), rseq]
                    lf.close()


        if xtrafast:
            run_recomb_filter = 1
        else:
            run_recomb_filter = 0

        recombination_sites = {}
        bedfile = ""
        bedfile_dict = {}
        if run_recomb_filter and len(blockfiles) > 0:
            logger.info("Running PhiPack on LCBs to detect recombination...")
            bedfile = open(os.path.join(outputDir, "parsnp.rec"), 'w')
            tasks = []
            processed = []
            for icnt, f in enumerate(blockfiles):
                seq1 = ""
                try:
                    bf = open(f, 'r')
                    seq1 = bf.read().split(">")[1].split("\n",1)[-1]
                    seq1 = seq1.replace("\n","")
                    bf.close()
                except IOError:
                    pass

                processed.append(f)
                params = {}
                path, f = f.rsplit(os.path.sep,1)
                params["jobID"] = len(tasks)
                params["query"] = f
                params["seqlen"] = len(seq1)
                params["spos"] = block_startpos[icnt]
                params["dir"] = path
                params["output"] = os.path.join(path, "Profile.csv")
                tasks.append(params)

            #run parallelPhiPack
            pool = Pool(processes=int(threads))
            result = pool.map_async(parallelPhiWrapper,tasks).get()

            for i in result:
                if (i["status"] == 1):
                    #process output
                    recregions = ""
                    block_spos = tasks[i["jobID"]]["spos"]
                    try:
                        recregions = open(tasks[i["jobID"]]["output"],'r').read()
                    except IOError:
                        logger.error("File %s doesn't exist, no rec regions or error in PhiPack\n"%(tasks[i["jobID"]]["output"]))
                        continue
                    reclines = recregions.split("\n")
                    prevpos = 0

                    for line in reclines:
                        try:
                            pos,eval = line.split(",")
                        except ValueError:
                            continue
                        pos = int(pos)
                        eval = float("%.5f"%(float(eval)))
                        if eval < 0.01 and eval >= 0:
                            idx = 0
                            srpos = 0
                            if pos-50 > 0:
                                srpos = (pos-50)+block_spos
                            else:
                                srpos = block_spos
                            eval = abs(eval)
                            if not multifasta:
                                bedfile_dict[srpos] = "1\t%s\t%s\tREC\t%.3f\t+\n"%(srpos,pos+50+block_spos,eval)
                            else:
                                chrnum = 1
                                chr_spos = list(ref_seqs.keys())
                                for cs in ref_seqs:
                                    if block_spos < len(chr_spos):
                                        chrnum = ref_seqs[cs]
                                bedfile_dict[srpos] = "%d\t%s\t%s\tREC\t%.3f\t+\n"%(chrnum,srpos,pos+50+block_spos,eval)

                    qfile = tasks[i["jobID"]]["query"]

                elif i["status"] != 2:
                    logger.critical("Parallel phipack job %d failed\n"%(i["jobID"]))
                    raise IOError

            pool.close()
            pool.join()
            brkeys = list(bedfile_dict.keys())
            brkeys.sort()
            for key in brkeys:
                bedfile.write(bedfile_dict[key])
            bedfile.close()

    else:
        # Partition mode: split the input genomes into chunks, align each
        # chunk with its own parsnp run in parallel, then intersect and merge
        # the per-chunk LCBs into one XMFA.
        import partition 
        # NOTE(review): if len(finalfiles) < args.min_partition_size this is 0
        # and the next line raises ZeroDivisionError -- presumably this branch
        # is only entered with enough genomes; confirm the upstream guard.
        full_partitions = len(finalfiles) // args.min_partition_size
        effective_partition_size = len(finalfiles) // full_partitions
        logger.info(f"Setting the partition size to {effective_partition_size}")
        partition_output_dir = f"{outputDir}/partition"
        partition_list_dir = f"{partition_output_dir}/input-lists"
        os.makedirs(partition_list_dir, exist_ok=True)
        # One input-list file per chunk; zero-padded label keeps lexical order.
        for partition_idx in range(math.ceil(len(finalfiles) / effective_partition_size)):
            with open(f"{partition_list_dir}/{partition.CHUNK_PREFIX}-{partition_idx:010}.txt", 'w') as part_out:
                for qf in finalfiles[partition_idx*effective_partition_size : (partition_idx+1)*effective_partition_size]:
                    part_out.write(f"{qf}\n")

        # Recover the chunk labels back from the list files just written.
        chunk_label_parser = re.compile(f'{partition.CHUNK_PREFIX}-(.*).txt')
        chunk_labels = []
        for partition_list_file in os.listdir(partition_list_dir):
            chunk_labels.append(chunk_label_parser.search(partition_list_file).groups()[0])

        # Create an output directory + INI per chunk; available threads are
        # split evenly across chunks (minimum 1 each).
        chunk_output_dirs = []
        for cl in chunk_labels:
            chunk_output_dir = f"{partition_output_dir}/{partition.CHUNK_PREFIX}-{cl}-out"
            # os.makedirs(chunk_output_dir, exist_ok=True)
            create_output_directory(chunk_output_dir)
            chunk_output_dirs.append(chunk_output_dir)
            with open(f"{partition_list_dir}/{partition.CHUNK_PREFIX}-{cl}.txt", 'r') as partition_file:
                partition_seq_files = [f for f in partition_file.read().splitlines()]

            write_inifile_2(inifiled, chunk_output_dir, unaligned, args, auto_ref, query, partition_seq_files, ref, max(1, args.threads // len(chunk_labels)))

        logger.info("Running partitions...")
        # Run the per-chunk aligners in parallel; failed chunks are dropped
        # from further processing instead of aborting the whole run.
        good_chunks = set(chunk_labels)
        with Pool(args.threads) as pool:
            return_codes = tqdm(
                pool.imap(run_parsnp_aligner, chunk_output_dirs, chunksize=1), 
                total=len(chunk_output_dirs), 
                file=TqdmToLogger(logger,level=logging.INFO),
                mininterval=MIN_TQDM_INTERVAL)
            # imap preserves input order, so labels and return codes line up.
            for cl, rc in zip(chunk_labels, return_codes):
                if rc != 0:
                    logger.error(f"Partition {cl} failed...")
                    good_chunks.remove(cl)
            
        chunk_labels = sorted(list(good_chunks))

        logger.info("Computing intersection of all partition LCBs...")
        chunk_to_intvervaldict = partition.get_chunked_intervals(partition_output_dir, chunk_labels)
        intersected_interval_dict = partition.get_intersected_intervals(chunk_to_intvervaldict)
        
        logger.info("Trimming partitioned XMFAs back to intersected intervals...")
        num_clusters = partition.trim_xmfas(partition_output_dir, chunk_labels, intersected_interval_dict, args.threads)

        xmfa_out_f =  f"{outputDir}/parsnp.xmfa"

        logger.info(f"Merging trimmed XMFA files into a single XMFA at {xmfa_out_f}")
        partition.merge_xmfas(partition_output_dir, chunk_labels, xmfa_out_f, num_clusters, args.threads)


    parsnp_output = f"{outputDir}/parsnp.xmfa"

    # This is the stuff for LCB extension:
    if args.extend_lcbs:
        logger.warning("The LCB extension module is experimental. Runtime may be significantly increased and extended alignments may not be as high quality as the original core-genome. Extensions off of existing LCBs are in a separate xmfa file.")
        import partition 
        import extend as ext

        orig_parsnp_xmfa = parsnp_output
        # Extended LCBs go to a sibling file, leaving parsnp.xmfa untouched.
        extended_parsnp_xmfa = orig_parsnp_xmfa + ".extended"

        # Index input fasta files and original xmfa 
        index_to_gid, gid_to_index = ext.parse_xmfa_header(orig_parsnp_xmfa)
        gid_to_records, gid_to_cid_to_index, gid_to_index_to_cid = ext.index_input_sequences(orig_parsnp_xmfa, finalfiles + [ref])

        # Get covered regions of xmfa file
        idpair_to_segments, cluster_to_named_segments = ext.xmfa_to_covered(orig_parsnp_xmfa, index_to_gid, gid_to_index_to_cid)

        # Extend clusters
        logger.info(f"Extending LCBs with SPOA...")
        new_lcbs = ext.extend_clusters(orig_parsnp_xmfa, gid_to_index, gid_to_cid_to_index, idpair_to_segments, cluster_to_named_segments, gid_to_records)

        # Write output: copy the original XMFA header, then append each newly
        # extended LCB alignment.
        partition.copy_header(orig_parsnp_xmfa, extended_parsnp_xmfa)
        with open(extended_parsnp_xmfa, 'a') as out_f:
            for lcb in new_lcbs:
                partition.write_aln_to_xmfa(lcb, out_f)


    # Generate MAF file
    if not args.no_maf:
        logger.debug(f"Writing MAF file to {Path(parsnp_output).with_suffix('.maf')}")
        xmfa_to_maf(parsnp_output, Path(parsnp_output).with_suffix(".maf"), finalfiles + [ref])

    #add genbank here, if present
    if len(genbank_ref) != 0:
        # GenBank reference: pass each annotation file via -g so harvesttools
        # embeds the annotations in the gingr archive.
        rnc = f"harvesttools -q -o {outputDir}/parsnp.ggr -x {parsnp_output}"
        for file in genbank_files:
            rnc += " -g %s " %(file)
        command = rnc
    else:
        command = f"harvesttools -q -o {outputDir}/parsnp.ggr -f {ref} -x {parsnp_output}"

    # Harvest seems to fail sometimes when piping to stderr/stdout...
    run_command(command)

    if run_recomb_filter:
        # Fold the PhiPack BED track (parsnp.rec) into the gingr archive.
        command = "harvesttools -q -b %s/parsnp.rec,REC,\"PhiPack\" -o %s/parsnp.ggr -i %s/parsnp.ggr"%(outputDir,outputDir,outputDir)
        run_command(command)

    # Extract the SNP multi-alignment (mblocks) used below for tree building.
    run_logged_command(
        f"harvesttools -i {outputDir}/parsnp.ggr -S {outputDir}/parsnp.snps.mblocks",
        outputDir=outputDir,
        label="harvest-mblocks")

    if generate_vcf:
        run_logged_command(
            f"harvesttools -q -x {parsnp_output} -V {outputDir+os.sep}parsnp.vcf", 
            outputDir, label="harvest-vcf")

    if not args.skip_phylogeny:
        logger.info("Reconstructing core genome phylogeny...")
        # Fall back to FastTree when any sequence in the SNP alignment has
        # fewer than 6 columns (too few sites for RAxML).
        mblocks_seqs = SeqIO.parse(os.path.join(outputDir, "parsnp.snps.mblocks"), "fasta")
        for seq in mblocks_seqs:
            if len(seq) < 6:
                logger.warning("Not enough SNPs to use RaxML. Attempting to use FastTree instead...")
                use_fasttree = True
                break
        # NOTE(review): `use_fasttree` is presumably initialized earlier
        # (e.g. from args.use_fasttree) -- confirm it is always bound here,
        # otherwise this raises NameError when the loop never breaks.
        if not use_fasttree:
            with TemporaryDirectory() as raxml_output_dir:
                command = "raxmlHPC-PTHREADS -m GTRCAT -p 12345 -T %d -s %s -w %s -n OUTPUT"%(threads,outputDir+os.sep+"parsnp.snps.mblocks", raxml_output_dir)
                run_logged_command(command, outputDir=outputDir, label="raxml")
                # Keep only the best tree; the TemporaryDirectory discards the rest.
                shutil.move(f"{raxml_output_dir}/RAxML_bestTree.OUTPUT", outputDir + os.sep + "parsnp.tree")

        else:
            if shutil.which("FastTreeMP") is not None:
                os.environ["OMP_NUM_THREADS"] = str(threads)
                command = "FastTreeMP -nt -quote -gamma -slow -boot 100 "+outputDir+os.sep+"parsnp.snps.mblocks"
            else:
                logger.info("FastTreeMP not found. Trying fasttree...")
                command = "fasttree -nt -quote -gamma -slow -boot 100 "+outputDir+os.sep+"parsnp.snps.mblocks"

            run_logged_command(command, outputDir=outputDir, label="fasttree")
            # FastTree prints the newick tree to stdout, which
            # run_logged_command captures into log/fasttree.out.
            shutil.move(f"{outputDir}/log/fasttree.out", f"{outputDir}/parsnp.tree")


        #7)reroot to midpoint
        if os.path.exists("outtree"):
             os.remove("outtree")

        # Midpoint-reroot via dendropy when available (reroot_tree is False if
        # the dendropy import failed at module load) and there is more than
        # one genome.
        if reroot_tree and len(finalfiles) > 1:
            try:
                mtree = open("%sparsnp.tree"%(outputDir+os.sep), 'r')
                mtreedata = mtree.read()
                mtreedata = mtreedata.replace("\n","")
                tree = dendropy.Tree.get_from_string(mtreedata,"newick")
                tree.reroot_at_midpoint(update_bipartitions=False)
                # Drop the first whitespace-separated token of dendropy's
                # output (presumably its rooting annotation -- verify).
                mftreef = tree.as_string('newick').split(" ",1)[1]
                #print mftreef
                mtreef = open(outputDir+os.sep+"parsnp.final.tree",'w')
                mtreef.write(mftreef)
                mtreef.close()
                # NOTE(review): shell `mv` is non-portable; shutil.move would
                # be the idiomatic replacement.
                os.system("mv %s %s"%(outputDir+os.sep+"parsnp.final.tree",outputDir+os.sep+"parsnp.tree"))
            except IOError:
                logger.error("Cannot process {} output, skipping midpoint reroot..\n".format("fasttree" if args.use_fasttree else "RaxML"))


        if len(use_gingr) > 0:
            logger.info("Creating Gingr input file..")
            # NOTE(review): `xtrafast or 1` is always true, so this branch
            # always runs; the condition is vestigial.
            if xtrafast or 1:
                #if newick available, add
                #new flag to update branch lengths
                run_command("harvesttools --midpoint-reroot -u -q -i "+outputDir+os.sep+"parsnp.ggr -o "+outputDir+os.sep+"parsnp.ggr -n %s"%(outputDir+os.sep+"parsnp.tree "))


    # Report total wall-clock runtime in human-friendly units.
    t2 = time.time()
    elapsed = float(t2)-float(t1)
    if float(elapsed)/float(60.0) > 60:
        logger.info("Aligned %d genomes in %.2f hours"%(totseqs,float(elapsed)/float(3600.0)))
    elif float(elapsed) > 60:
        #TODO just format the time to get rid of the above formatting
        logger.info("Aligned %d genomes in %.2f minutes"%(totseqs,float(elapsed)/float(60.0)))
    else:
        logger.info("Aligned %d genomes in %.2f seconds"%(totseqs,float(elapsed)))
    #cleanup
    # Remove per-block alignment intermediates.
    rmfiles = glob(os.path.join(outputDir, "*.aln"))
    #rmfiles2 = glob.glob(outputDir+os.sep+"blocks/b*/*")
    rmfiles3 = glob(os.path.join(outputDir, "blocks/b*"))
    for f in rmfiles:
        os.remove(f)
    for f in rmfiles3:
        shutil.rmtree(f)

    filepres = 0
    logger.info("Parsnp finished! All output available in %s"%(outputDir))
    logger.debug("Validating output directory contents")
    # Precedence note: `or` binds looser than `and`, so this reads as
    # `skip_phylogeny or (tree exists and is non-empty)` -- the tree check is
    # waived when phylogeny was skipped.
    if args.skip_phylogeny or os.path.exists("%sparsnp.tree"%(outputDir+os.sep)) and os.path.getsize("%sparsnp.tree"%(outputDir+os.sep)) > 0:
        filepres+=1
    else:
        logger.error("parsnp.tree:\t\tnewick format tree is missing!")
    if os.path.exists("%sparsnp.ggr"%(outputDir+os.sep)) and os.path.getsize("%sparsnp.ggr"%(outputDir+os.sep)) > 0:
        filepres+=1
    else:
        logger.error("parsnp.ggr:\t\tharvest input file for gingr (GUI) is missing!")
    if os.path.exists("%sparsnp.xmfa"%(outputDir+os.sep)) and os.path.getsize("%sparsnp.xmfa"%(outputDir+os.sep)) > 0:
        filepres+=1
    else:
        logger.error("parsnp.xmfa:\t\tXMFA formatted multi-alignment is missing")
    # if filepres != 3:
        # logger.critical("Output files missing, something went wrong. Check logs and relaunch or contact developers for assistance")

    # Remove now-empty working directories and scratch files.
    if os.path.exists("%sblocks"%(outputDir+os.sep)):
        os.rmdir("%sblocks"%(outputDir+os.sep))
    if os.path.exists("allmums.out"):
        os.remove("allmums.out")

    # Remaining intermediates are kept only when VERBOSE is set (debugging).
    if not VERBOSE and os.path.exists("parsnpAligner.ini"):
        os.remove("parsnpAligner.ini")

    prefix = os.path.join(outputDir, os.path.splitext(os.path.basename(ref))[0])
    if not VERBOSE and os.path.exists("%s.coords"%(prefix)):
        os.remove("%s.coords"%(prefix))

    if not VERBOSE and os.path.exists("%s.delta"%(prefix)):
        os.remove("%s.delta"%(prefix))

    for f in glob(os.path.join(outputDir,"*.reps")):
        if not VERBOSE and os.path.exists(f):
            os.remove(f)

    # for f in glob(os.path.join(outputDir, "*.ref")):
        # if not VERBOSE and os.path.exists(f):
            # os.remove(f)

    if not VERBOSE and os.path.exists("%s/psnn.ini"%(outputDir)):
        os.remove("%s/psnn.ini"%(outputDir))

    if not VERBOSE and os.path.exists("%s/all_mumi.ini"%(outputDir)):
        os.remove("%s/all_mumi.ini"%(outputDir))

    if not VERBOSE and os.path.exists("%s/tmp"%(outputDir)):
        shutil.rmtree("%s/tmp"%(outputDir))

    # if os.path.exists("%s/parsnp.snps.mblocks"%(outputDir)):
        # os.remove("%s/parsnp.snps.mblocks"%(outputDir))
    if os.path.exists("%s/parsnp.snps.mblocks.reduced"%(outputDir)):
        os.remove("%s/parsnp.snps.mblocks.reduced"%(outputDir))

    if not VERBOSE and os.path.exists("%s/all.mumi"%(outputDir)):
        os.remove("%s/all.mumi"%(outputDir))

    # Optionally launch the Gingr GUI on the result (only wired up for the
    # macOS binary here; `open -n` is macOS-specific).
    if os.path.exists(use_gingr):
        #check if available first
        rc = 0
        if binary_type == "osx":
            logger.info("Launching gingr..")
            os.system("open -n %s --args %s/parsnp.ggr"%(use_gingr,outputDir))
