Source code for pyGenClean.DupSNPs.duplicated_snps

#!/usr/bin/env python2.7

# This file is part of pyGenClean.
#
# pyGenClean is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation, either version 3 of the License, or (at your option) any later
# version.
#
# pyGenClean is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
# A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along with
# pyGenClean.  If not, see <http://www.gnu.org/licenses/>.


import os
import sys
import math
import shutil
import random
import logging
import StringIO
import argparse
import subprocess
from collections import defaultdict

import numpy as np

from .. import __version__
from ..PlinkUtils import createRowFromPlinkSpacedOutput


logger = logging.getLogger("duplicated_snps")


[docs]def main(argString=None):
    """The main function of the module..

    Here are the steps for duplicated samples:

    1.  Prints the options.
    2.  Reads the ``map`` file to gather marker's position
        (:py:func:`readMAP`).
    3.  Reads the ``tfam`` file (:py:func:`readTFAM`).
    4.  Finds the unique markers using the ``map`` file
        (:py:func:`findUniques`).
    5.  Process the ``tped`` file. Write a file containing unique markers in
        ``prefix.unique_snps`` (``tfam`` and ``tped``). Keep in memory
        information about the duplicated markers (``tped``)
        (:py:func:`processTPED`).
    6.  If there are no duplicated markers, the file ``prefix.unique_snps``
        (``tped`` and ``tfam``) are copied to ``prefix.final``.
    7.  If there are duplicated markers, print a ``tped`` and ``tfam`` file
        containing the duplicated markers
        (:py:func:`printDuplicatedTPEDandTFAM`).
    8.  Computes the frequency of the duplicated markers (using Plink) and read
        the output file (:py:func:`computeFrequency`).
    9.  Computes the concordance and pairwise completion of each of the
        duplicated markers (:py:func:`computeStatistics`).
    10. Prints the problematic duplicated markers with a file containing the
        summary of the statistics (completion and pairwise concordance)
        (:py:func:`printProblems`).
    11. Print the pairwise concordance in a file (matrices)
        (:py:func:`printConcordance`).
    12. Choose the best duplicated markers using concordance and completion
        (:py:func:`chooseBestSnps`).
    13. Completes the chosen markers with the remaining duplicated markers
        (:py:func:`createAndCleanTPED`).
    14. Creates the final ``tped`` file, containing the unique markers, the
        chosen duplicated markers that were completed, and the problematic
        duplicated markers (for further analysis). This set excludes markers
        that were used for completing the chosen ones
        (:py:func:`createFinalTPEDandTFAM`).

    """
    # Getting and checking the options
    args = parseArgs(argString)
    checkArgs(args)

    logger.info("Options used:")
    for key, value in vars(args).iteritems():
        logger.info("  --{} {}".format(key.replace("_", "-"), value))

    # Reading the map file
    logger.info("Reading MAP file")
    mapF = readMAP(args.tfile + ".map", args.out)

    # Reading the tfam file
    logger.info("Reading TFAM file")
    tfam = readTFAM(args.tfile + ".tfam")

    # Find unique snps
    logger.info("Finding unique SNPs")
    uniqueSNPs = findUniques(mapF)

    # Process the TPED file
    logger.info("Reading TPED file")
    tped, duplicatedSNPs = processTPED(uniqueSNPs, mapF,
                                       args.tfile + ".tped",
                                       args.tfile + ".tfam", args.out)

    if len(tped) == 0:
        logger.info("There are no duplicated SNPs")
        logger.info("  - Creating final TFAM")
        # Copying the files
        # The TFAM
        try:
            shutil.copy(args.out + ".unique_snps.tfam",
                        args.out + ".final.tfam")
        except IOError:
            msg = "%s.unique_snps.tfam: can't copy file to " \
                  "%s.final.tfam" % (args.out, args.out)
            raise ProgramError(msg)

        # The TPED
        logger.info("  - Creating final TPED")
        try:
            shutil.copy(args.out + ".unique_snps.tped",
                        args.out + ".final.tped")
        except IOError:
            msg = "%s.unique_snps.tped: can't copy file to " \
                  "%s.final.tped" % (args.out, args.out)
            raise ProgramError(msg)

    else:
        # We print the TPED and TFAM for the duplicated SNPs
        logger.info("Printing duplicated SNPs TPED and TFAM files")
        printDuplicatedTPEDandTFAM(tped, args.tfile + ".tfam", args.out)

        # Computing the frequency of the duplicated SNPs
        logger.info("Computing duplicated SNPs' frequencies")
        dupSNPsFreq = computeFrequency(args.out + ".duplicated_snps",
                                       args.out)

        # Compute statistics
        logger.info("Computing concordance and completion of duplicated SNPs")
        completion, concordance = computeStatistics(tped, tfam, duplicatedSNPs)

        # Print the statistics
        logger.info("Printing duplicated SNPs summary file and finding errors "
                    "within duplicates")
        snpsToComplete = printProblems(completion, concordance, tped,
                                       duplicatedSNPs, dupSNPsFreq, args.out,
                                       args.frequency_difference)

        # Print the concordance file
        logger.info("Printing concordance file")
        printConcordance(concordance, args.out, tped, duplicatedSNPs)

        # Choose the best SNP
        logger.info("Choosing best SNP for each duplicates")
        chosenSNPs, comp, conc = chooseBestSnps(tped, duplicatedSNPs,
                                                completion, concordance,
                                                args.out)

        # Complete the SNPs
        logger.info("Completing chosen duplicates (removing discordant "
                    "genotypes)")
        newTPED, snpToRemove = createAndCleanTPED(
            tped,
            tfam,
            duplicatedSNPs,
            args.out,
            chosenSNPs,
            comp,
            conc,
            snpsToComplete,
            args.tfile + ".tfam",
            args.snp_completion_threshold,
            args.snp_concordance_threshold,
        )

        # Creates the final tped
        logger.info("Writing final TPED and TFAM file")
        createFinalTPEDandTFAM(newTPED, args.out + ".unique_snps",
                               args.out, snpToRemove)


[docs]def createFinalTPEDandTFAM(tped, toReadPrefix, prefix, snpToRemove):
    """Creates the final TPED and TFAM.

    :param tped: a representation of the ``tped`` of duplicated markers.
    :param toReadPrefix: the prefix of the unique files.
    :param prefix: the prefix of the output files.
    :param snpToRemove: the markers to remove.

    :type tped: numpy.array
    :type toReadPrefix: str
    :type prefix: str
    :type snpToRemove: set

    Starts by copying the unique markers' ``tfam`` file to
    ``prefix.final.tfam``. Then, it copies the unique markers' ``tped`` file,
    in which the chosen markers will be appended.

    The final data set will include the unique markers, the chosen markers
    which were completed, and the problematic duplicated markers (for further
    analysis). The markers that were used to complete the chosen ones are not
    present in the final data set.

    """
    # First, copying the tfam
    try:
        shutil.copy(toReadPrefix + ".tfam", prefix + ".final.tfam")
    except IOError:
        msg = "%(toReadPrefix)s.tfam: can't copy file to " \
              "%(prefix)s.final.tfam" % locals()
        raise ProgramError(msg)

    # Next, copy the tped, and append at the end
    try:
        shutil.copy(toReadPrefix + ".tped", prefix + ".final.tped")
    except IOError:
        msg = "%(toReadPrefix)s.tped: can't copy fil to " \
              "%(prefix)s.final.tped" % locals()
        raise ProgramError(msg)
    tpedFile = None
    try:
        tpedFile = open(prefix + ".final.tped", "a")
    except IOError:
        msg = "%(prefix)s.final.tped: can't append to file" % locals()
        raise ProgramError(msg)
    for i, row in enumerate(tped):
        if i not in snpToRemove:
            print >>tpedFile, "\t".join(row)
    tpedFile.close()


[docs]def createAndCleanTPED(tped, tfam, snps, prefix, chosenSNPs, completion,
                       concordance, snpsToComplete, tfamFileName, completionT,
                       concordanceT):
    """Complete a TPED for duplicated SNPs.

    :param tped: a representation of the ``tped`` of duplicated markers.
    :param tfam: a representation of the ``tfam``.
    :param snps: the position of duplicated markers in the ``tped``.
    :param prefix: the prefix of the output files.
    :param chosenSNPs: the markers that were chosen for completion (including
                       problems).
    :param completion: the completion of each of the duplicated markers.
    :param concordance: the pairwise concordance of the duplicated markers.
    :param snpsToComplete: the markers that will be completed (excluding
                           problems).
    :param tfamFileName: the name of the original ``tfam`` file.
    :param completionT: the completion threshold.
    :param concordanceT: the concordance threshold.

    :type tped: numpy.array
    :type tfam: list
    :type snps: dict
    :type prefix: str
    :type chosenSNPs: dict
    :type completion: numpy.array
    :type concordance: dict
    :type snpsToComplete: set
    :type tfamFileName: str
    :type completionT: float
    :type concordanceT: float

    :returns: a tuple containing the new ``tped`` after completion
              (:py:class:`numpy.array` as the first element, and the index of
              the markers that will need to be rid of (:py:class:`set`) as the
              last element.

    It creates three different files:

    * ``prefix.zeroed_out``: contains information about markers and samples
                             where the genotyped was zeroed out.
    * ``prefix.not_good_enough``: contains information about markers that were
                                  not good enough to help in completing the
                                  chosen markers (because of concordance or
                                  completion).
    * ``prefix.removed_duplicates``: the list of markers that where used for
                                     completing the chosen one, hence they will
                                     be removed from the final data
                                     set.

    Cycling through every genotypes of every samples of every duplicated
    markers, checks if the genotypes are all the same. If the chosen one was
    not called, but the other ones were, then we complete the chosen one with
    the genotypes for the others (assuming that they are all the same). If
    there is a difference between the genotypes, it is zeroed out for the
    chosen marker.

    """
    zeroedOutFile = None
    try:
        zeroedOutFile = open(prefix + ".zeroed_out", "w")
    except IOError:
        msg = "%(prefix).zeroed_out: can't write file" % locals()
        raise ProgramError(msg)
    print >>zeroedOutFile, "\t".join(["famID", "indID", "snpID"])

    notGoodEnoughFile = None
    try:
        notGoodEnoughFile = open(prefix + ".not_good_enough", "w")
    except IOError:
        msg = "%(prefix)s.not_good_enough: can't write file" % locals()
        raise ProgramError(msg)
    print >>notGoodEnoughFile, "\t".join(["name", "reason"])

    removedFile = None
    try:
        removedFile = open(prefix + ".removed_duplicates", "w")
    except IOError:
        msg = "%(prefix)s.removed_duplicates: can't write file" % locals()
        raise ProgramError(msg)

    notGoodEnoughSnps = set()

    # Split the tped in 'snpInfo' and 'genotypes'
    snpInfo = tped[:, :4]
    genotypes = tped[:, 4:]

    # The sed of index we want to get rid of at the end
    getRidOfIndex = set()

    for snpID, indexes in snps.iteritems():
        if snpID not in snpsToComplete:
            # We don't want to complete this SNP, so we continue to next SNP
            continue

        # Getting the completion
        completionToRemove = set(
            np.where(completion[indexes] < completionT)[0]
        )
        for k in completionToRemove:
            notGoodEnoughSnps.add((snpInfo[indexes][k, 1], "completion"))

        # Getting the concordance
        concordanceToRemove = set(
            np.where(concordance[snpID] < concordanceT)[0]
        )
        for k in concordanceToRemove:
            notGoodEnoughSnps.add((snpInfo[indexes][k, 1], "concordance"))

        # These will be the indexes to remove
        indexesToRemove = set()
        for index in completionToRemove | concordanceToRemove:
            indexesToRemove.add(indexes[index])

        # These are the indexes to keep
        indexesToKeep = []
        for index in indexes:
            if index not in indexesToRemove:
                indexesToKeep.append(index)

        # Getting the chosen SNP
        chosenOne = chosenSNPs[snpID]
        if chosenOne not in set(indexesToKeep):
            # The chosen SNP is not a good SNP, so we go to next SNP
            logger.warning("  - {} chosen but not good enough".format(
                snpInfo[chosenOne, 1],
            ))
            continue

        # Now cycling through the genotypes
        nbSamples = genotypes.shape[1]
        for sampleIndex in xrange(nbSamples):
            # We need to remove the no call and keep the unique genotypes
            curGenotypes = genotypes[indexesToKeep, sampleIndex]
            cleanedCurGenotypes = curGenotypes[
                np.where(curGenotypes != "0 0")
            ]
            uniqueCleanedCurGenotypes = np.unique(cleanedCurGenotypes)

            # Checking the number of unique genotypes
            toComplete = False
            if len(uniqueCleanedCurGenotypes) > 1:
                # There are more than one unique genotype (except 0 0)
                # len = 0 means all were 0 0
                # len = 1 means they are all the same
                # len > 1 means discordance (might need to flip)
                # Just need to check the order of the alleles
                possibleAlleles = [
                    set() for k in xrange(len(uniqueCleanedCurGenotypes))
                ]
                for k, geno in enumerate(uniqueCleanedCurGenotypes):
                    possibleAlleles[k] |= set(geno.split(" "))
                allEqual = True
                for k in xrange(len(possibleAlleles)):
                    for l in xrange(k+1, len(possibleAlleles)):
                        if possibleAlleles[k] != possibleAlleles[l]:
                            allEqual = False

                if not allEqual:
                    # The genotypes are not all equal, we set the chosen
                    # genotype to null (0 0)
                    tped[chosenOne, sampleIndex+4] = "0 0"
                    print >>zeroedOutFile, "\t".join([tfam[sampleIndex, 0],
                                                      tfam[sampleIndex, 1],
                                                      snpInfo[chosenOne, 1]])
                elif genotypes[chosenOne, sampleIndex] == "0 0":
                    toComplete = True
            elif ((len(uniqueCleanedCurGenotypes) == 1) and
                    (genotypes[chosenOne, sampleIndex] == "0 0")):
                toComplete = True

            if toComplete:
                # We complete the current individual
                tped[chosenOne, sampleIndex+4] = uniqueCleanedCurGenotypes[0]

        # We keep only the chose one
        for index in indexes:
            if index != chosenOne:
                getRidOfIndex.add(index)
                print >>removedFile, snpInfo[index, 1]

    # Writing the not good enough file
    for item in notGoodEnoughSnps:
        print >>notGoodEnoughFile, "\t".join(item)

    # Closing the output files
    zeroedOutFile.close()
    notGoodEnoughFile.close()

    # Printing the chosen file
    try:
        shutil.copy(tfamFileName, prefix + ".chosen_snps.tfam")
    except IOError:
        msg = "%(tfamFileName)s: can't copy file to " \
              "%(prefix)s.chosen_snps.tfam" % locals()
        raise ProgramError(msg)
    chosenFile = None
    try:
        chosenFile = open(prefix + ".chosen_snps.tped", "w")
    except IOError:
        msg = "%(prefix)s.chosen_snps.tped: can't write file" % locals()
        raise ProgramError(msg)
    for chosenOne in chosenSNPs.itervalues():
        snpID = (tped[chosenOne, 0], tped[chosenOne, 3])
        if snpID in snpsToComplete:
            print >>chosenFile, "\t".join(tped[chosenOne])
    chosenFile.close()

    return tped, getRidOfIndex


[docs]def chooseBestSnps(tped, snps, trueCompletion, trueConcordance, prefix):
    """Choose the best duplicates according to the completion and concordance.

    :param tped: a representation of the ``tped`` of duplicated markers.
    :param snps: the position of the duplicated markers in the ``tped``.
    :param trueCompletion: the completion of each markers.
    :param trueConcordance: the pairwise concordance of each markers.
    :param prefix: the prefix of the output files.

    :type tped: numpy.array
    :type snps: dict
    :type trueCompletion: numpy.array
    :type trueConcordance: dict
    :type prefix: str

    :returns: a tuple containing the chosen indexes (:py:class:`dict`) as the
              first element, the completion (:py:class:`numpy.array`) as the
              second element, and the concordance (:py:class:`dict`) as last
              element.

    It creates two output files: ``prefix.chosen_snps.info`` and
    ``prefix.not_chosen_snps.info``. The first one contains the markers that
    were chosen for completion, and the second one, the markers that weren't.

    It starts by computing the completion of each markers (dividing the number
    of calls divided by the total number of genotypes). Then, for each of the
    duplicated markers, we choose the best one according to completion and
    concordance (see explanation in
    :py:func:`DupSamples.duplicated_samples.chooseBestDuplicates` for more
    details).

    """
    # The output files
    chosenFile = None
    try:
        chosenFile = open(prefix + ".chosen_snps.info", "w")
    except IOError:
        msg = "%(prefix)s.chosen_snps.info: can't write file" % locals()
        raise ProgramError(msg)

    excludedFile = None
    try:
        excludedFile = open(prefix + ".not_chosen_snps.info", "w")
    except IOError:
        msg = "%(prefix)s.not_chosen_snps.info: can't write file" % locals()
        raise ProgramError(msg)

    # Computing the completion
    completion = np.true_divide(trueCompletion[0], trueCompletion[1])

    # For each duplicated SNPs
    chosenIndexes = {}
    snpConcordance = {}
    for snp, indexes in snps.iteritems():
        # Getting the completion for those duplicated SNPs
        currCompletion = completion[indexes]

        # Sorting those completion
        sortedCompletionInsexes = np.argsort(currCompletion)

        # Getting the concordance
        concordance = np.true_divide(trueConcordance[snp][0],
                                     trueConcordance[snp][1])

        currConcordance = [[] for i in xrange(len(indexes))]
        for i in xrange(len(indexes)):
            indexToKeep = list(set(range(len(indexes))) - set([i]))
            currConcordance[i] = np.mean(concordance[i, indexToKeep])
        currConcordance = np.array(currConcordance)
        if snp not in snpConcordance:
            snpConcordance[snp] = currConcordance

        # Sorting the concordance
        sortedConcordanceIndexes = np.argsort(currConcordance)

        # Trying to find the best duplicate to keep
        nbToCheck = 1
        chosenIndex = None
        while nbToCheck <= len(indexes):
            # Getting the `nbToCheck` best value (higher to lower)
            completionValue = currCompletion[
                sortedCompletionInsexes[nbToCheck*-1]
            ]
            concordanceValue = currConcordance[
                sortedConcordanceIndexes[nbToCheck*-1]
            ]

            # Getting the indexes to consider
            completionToConsider = set(
                np.where(currCompletion >= completionValue)[0]
            )
            concordanceToConsider = set(
                np.where(currConcordance >= concordanceValue)[0]
            )

            # Getting the intersection of the indexes
            toConsider = concordanceToConsider & completionToConsider
            if len(toConsider) >= 1:
                chosenIndex = random.choice(list(toConsider))
                break
            nbToCheck += 1

        if chosenIndex is None:
            msg = "Could not choose the best snp ID"
            raise ProgramError(msg)

        # Printing the chosen SNPs
        print >>chosenFile, tped[indexes[chosenIndex], 1]

        # Printing the excluded SNPs
        for i, index in enumerate(indexes):
            if i != chosenIndex:
                print >>excludedFile, tped[index, 1]

        chosenIndexes[snp] = indexes[chosenIndex]

    # Closing the output files
    chosenFile.close()
    excludedFile.close()

    return chosenIndexes, completion, snpConcordance


[docs]def computeFrequency(prefix, outPrefix):
    """Computes the frequency of the SNPs using Plink.

    :param prefix: the prefix of the input files.
    :param outPrefix: the prefix of the output files.

    :type prefix: str
    :type outPrefix: str

    :returns: a :py:class:`dict` containing the frequency of each marker.

    Start by computing the frequency of all markers using Plink. Then, it reads
    the output file, and saves the frequency and allele information.

    """
    # The plink command
    plinkCommand = ["plink", "--noweb", "--tfile", prefix, "--freq", "--out",
                    outPrefix + ".duplicated_snps"]
    runCommand(plinkCommand)

    # Reading the frequency file
    snpFreq = {}
    try:
        with open(outPrefix + ".duplicated_snps.frq", "r") as inputFile:
            headerIndex = None
            for i, line in enumerate(inputFile):
                row = createRowFromPlinkSpacedOutput(line)
                if i == 0:
                    # This is the header
                    headerIndex = dict([
                        (row[j], j) for j in xrange(len(row))
                    ])

                    # Checking the column titles
                    for columnTitle in ["SNP", "MAF", "A1", "A2"]:
                        if columnTitle not in headerIndex:
                            msg = "%(outPrefix)s.duplicated_snps.frq: no " \
                                  "column called %(columnTitle)s" % locals()
                            raise ProgramError(msg)
                else:
                    # This is data
                    snpName = row[headerIndex["SNP"]]
                    maf = row[headerIndex["MAF"]]
                    a1 = row[headerIndex["A1"]]
                    a2 = row[headerIndex["A2"]]
                    try:
                        if maf == "NA":
                            maf = 0.0
                        else:
                            maf = float(maf)
                    except ValueError:
                        msg = "%(outPrefix)s.duplicated_snps.frq: %(maf)s: " \
                              "not a valid MAF" % locals()
                        raise ProgramError(msg)
                    snpFreq[snpName] = (maf, (a1, a2))
    except IOError:
        msg = "%(outPrefix)s.duplicated_snps.freq: no such file" % locals()
        raise ProgramError(msg)

    return snpFreq


[docs]def runCommand(command):
    """Run the command in Plink.

    :param command: the command to run.

    :type command: list

    Tries to run a command using :py:mod:`subprocess`.

    """
    output = None
    try:
        output = subprocess.check_output(command,
                                         stderr=subprocess.STDOUT, shell=False)
    except subprocess.CalledProcessError:
        msg = "plink: couldn't run plink\n"
        msg += " ".join(command)
        raise ProgramError(msg)


[docs]def printDuplicatedTPEDandTFAM(tped, tfamFileName, outPrefix):
    """Print the duplicated SNPs TPED and TFAM.

    :param tped: a representation of the ``tped`` of duplicated markers.
    :param tfamFileName: the name of the original ``tfam`` file.
    :param outPrefix: the output prefix.

    :type tped: numpy.array
    :type tfamFileName: str
    :type outPrefix: str

    First, it copies the original ``tfam`` into
    ``outPrefix.duplicated_snps.tfam``. Then, it prints the ``tped`` of
    duplicated markers in ``outPrefix.duplicated_snps.tped``.

    """
    # Copying the tfam file
    try:
        shutil.copy(tfamFileName, outPrefix + ".duplicated_snps.tfam")
    except IOError:
        msg = "%(tfamFileName)s: can't copy file to " \
              "%(outPrefix)s.duplicated_snps.tfam" % locals()
        raise ProgramError(msg)

    # Writing the tped
    tpedFile = None
    try:
        tpedFile = open(outPrefix + ".duplicated_snps.tped", "w")
    except IOError:
        msg = "%(outPrefix)s.duplicated_snps.tped: can't write " \
              "file" % locals()
        raise ProgramError(msg)
    for row in tped:
        print >>tpedFile, "\t".join(row)
    tpedFile.close()


[docs]def printConcordance(concordance, prefix, tped, snps):
    """Print the concordance.

    :param concordance: the concordance.
    :param prefix: the prefix if the output files.
    :param tped: a representation of the ``tped`` of duplicated markers.
    :param snps: the position of the duplicated markers in the ``tped``.

    :type concordance: dict
    :type prefix: str
    :type tped: numpy.array
    :type snps: dict

    Prints the concordance in a file, in the format of a matrix. For each
    duplicated markers, the first line (starting with the `#` signs) contains
    the name of all the markers in the duplicated markers set. Then a :math:`N
    \\times N` matrix is printed to file (where :math:`N` is the number of
    markers in the duplicated marker list), containing the pairwise
    concordance.

    """
    outFile = None
    try:
        outFile = open(prefix + ".concordance", "w")
    except IOError:
        msg = "%s: can't write file" % prefix + ".concordance"
        raise ProgramError(msg)

    for snpID in concordance.iterkeys():
        print >>outFile, "#" + "\t".join(
            list(snpID) + list(tped[snps[snpID], 1])
        )

        # Doing the division
        true_concordance = np.true_divide(concordance[snpID][0],
                                          concordance[snpID][1])

        output = StringIO.StringIO()
        np.savetxt(output, true_concordance, delimiter="\t", fmt="%.8f")
        print >>outFile, output.getvalue().rstrip("\r\n")

    outFile.close()


[docs]def printProblems(completion, concordance, tped, snps, frequencies, prefix,
                  diffFreq):
    """Print the statistics.

    :param completion: the completion of each duplicated markers.
    :param concordance: the pairwise concordance between duplicated markers.
    :param tped: a representation of the ``tped`` of duplicated markers.
    :param snps: the positions of the duplicated markers in the ``tped``
    :param frequencies: the frequency of each of the duplicated markers.
    :param prefix: the prefix of the output files.
    :param diffFreq: the frequency difference threshold.

    :type completion: numpy.array
    :type concordance: dict
    :type tped: numpy.array
    :type snps: dict
    :type frequencies: dict
    :type prefix: str
    :type diffFreq: float

    :returns: a :py:class:`set` containing duplicated markers to complete.

    Creates a summary file (``prefix.summary``) containing information about
    duplicated markers: chromosome, position, name, alleles, status, completion
    percentage, completion number and mean concordance.

    The frequency and the minor allele are used to be certain that two
    duplicated markers are exactly the same marker (and not a tri-allelic one,
    for example).

    For each duplicated markers:

    1. Constructs the set of available alleles for the first marker.
    2. Constructs the set of available alleles for the second marker.
    3. If the two sets are different, but the number of alleles is the same, we
       try to flip one of the marker. If the two sets are the same, but the
       number of alleles is 1, we set the status to ``homo_flip``. If the
       markers are heterozygous, we set the status to ``flip``.
    4. If there is a difference in the number of alleles (one is homozygous,
       the other, heterozygous), and that there is on allele in common, we set
       the status to ``homo_hetero``. If there are no allele in common, we try
       to flip one. If the new sets have one allele in common, we set the
       status to ``homo_hetero_flip``.
    5. If the sets of available alleles are the same (without flip), we check
       the frequency and the minor alleles. If the minor allele is different,
       we set the status to ``diff_minor_allele``. If the difference in
       frequencies is higher than a threshold, we set the status to
       ``diff_frequency``.
    6. If all of the above fail, we set the status to ``problem``.

    Problems are written in the ``prefix.problems`` file, and contains the
    following columns: chromosome, position, name and status. This file
    contains all the markers with a status, as explained above.

    """

    completionPercentage = np.true_divide(completion[0], completion[1])

    outSummary = None
    try:
        outSummary = open(prefix + ".summary", "w")
    except IOError:
        msg = "%s: can't write file" % prefix + ".summary"
        raise ProgramError

    # Prints the header of the summary file
    print >>outSummary, "\t".join(["chr", "pos", "name", "alleles", "status",
                                   "% completion", "completion",
                                   "mean concordance"])

    # The data structure containing the problems
    problems = {}
    for snpID, indexes in snps.iteritems():
        for i, index in enumerate(indexes):
            # The SNP information (chromosome and position)
            toPrint = list(snpID)

            # The name of the SNP
            snpName = tped[index, 1]
            toPrint.append(snpName)

            # The frequency of the SNP
            snpFreq, mafAlleles = frequencies[snpName]

            # A list of the other SNP name with problems
            otherSnpNameWithProblem = set()

            # The alleles
            alleles = set()
            otherAlleles = set()
            status = []
            for genotype in np.unique(tped[index, 4:]):
                alleles |= set(genotype.split(" "))
            if "0" in alleles:
                alleles.remove("0")
            for j in xrange(i+1, len(indexes)):
                otherIndex = indexes[j]
                otherSnpName = tped[otherIndex, 1]

                # The frequency of the other SNP
                otherSnpFreq, otherMafAlleles = frequencies[otherSnpName]

                # Checking the alleles
                for genotype in np.unique(tped[otherIndex, 4:]):
                    otherAlleles |= set(genotype.split(" "))
                if "0" in otherAlleles:
                    otherAlleles.remove("0")
                if alleles != otherAlleles:
                    if len(alleles) == len(otherAlleles):
                        # Same number of alleles
                        # Try the flipped ones
                        otherAlleles = flipGenotype(otherAlleles)
                        if alleles == otherAlleles:
                            if len(alleles) == 1:
                                status.append("homo_flip")
                                otherSnpNameWithProblem.add(otherSnpName)
                            else:
                                status.append("flip")
                                otherSnpNameWithProblem.add(otherSnpName)
                        else:
                            status.append("problem")
                            otherSnpNameWithProblem.add(otherSnpName)
                    else:
                        # Different number of alleles
                        if len(alleles & otherAlleles) == 1:
                            status.append("homo_hetero")
                            otherSnpNameWithProblem.add(otherSnpName)
                        else:
                            # Try the flipped one
                            otherAlleles = flipGenotype(otherAlleles)
                            if len(alleles & otherAlleles) == 1:
                                status.append("homo_hetero_flip")
                                otherSnpNameWithProblem.add(otherSnpName)
                            else:
                                status.append("problem")
                                otherSnpNameWithProblem.add(otherSnpName)
                else:
                    # The alleles are the same, so we check the frequency
                    if mafAlleles[0] != otherMafAlleles[0]:
                        # They don't have the same minor allele
                        status.append("diff_minor_allele")
                        otherSnpNameWithProblem.add(otherSnpName)
                    elif math.fabs(snpFreq - otherSnpFreq) > diffFreq:
                        # They don't have same frequency
                        status.append("diff_frequency")
                        otherSnpNameWithProblem.add(otherSnpName)

            alleles = list(alleles)
            alleles.sort()
            if len(alleles) == 1:
                alleles.append(alleles[0])
            toPrint.append(" ".join(alleles))
            toPrint.append(";".join(status))

            # The completion
            toPrint.append("%.8f" % completionPercentage[index])
            toPrint.append("%d/%d" % (completion[0][index],
                                      completion[1][index]))

            # The concordance
            indexToKeep = list(set(range(len(indexes))) - set([i]))
            currConcordance = np.true_divide(
                concordance[snpID][0][i, indexToKeep],
                concordance[snpID][1][i, indexToKeep],
            )
            currConcordance = np.mean(currConcordance)
            toPrint.append("%.8f" % currConcordance)
            print >>outSummary, "\t".join(toPrint)

            # Now updating the problems data structure
            if len(status) != len(otherSnpNameWithProblem):
                msg = "There is a problem with the problematic SNPs"
                raise ProgramError(msg)

            if len(status) > 0:
                if snpID not in problems:
                    tmp = {"snpNames": {snpName}, "problems": set()}
                    problems[snpID] = tmp

                # We have problems
                problems[snpID]["snpNames"] |= otherSnpNameWithProblem
                problems[snpID]["problems"] |= set(status)

    outSummary.close()

    outProblems = None
    try:
        outProblems = open(prefix + ".problems", "w")
    except IOError:
        msg = "%s: can't write file" % prefix + ".problems"
        raise ProgramError

    # Printing the header of the problem file...
    print >>outProblems, "\t".join(["chr", "pos", "name", "status"])
    for snpID in problems.iterkeys():
        toPrint = list(snpID)
        toPrint.append(";".join(list(problems[snpID]["snpNames"])))
        toPrint.append(";".join(list(problems[snpID]["problems"])))
        print >>outProblems, "\t".join(toPrint)

    outProblems.close()

    # Returning the SNPs to complete
    return set(snps.keys()) - set(problems.keys())


[docs]def computeStatistics(tped, tfam, snps):
    """Computes the completion and concordance of each SNPs.

    :param tped: a representation of the ``tped``.
    :param tfam: a representation of the ``tfam``
    :param snps: the position of the duplicated markers in the ``tped``.

    :type tped: numpy.array
    :type tfam: list
    :type snps: dict

    :returns: a tuple containing the completion of duplicated markers
              (:py:class:`numpy.array`) as first element, and the concordance
              (:py:class:`dict`) of duplicated markers, as last element.

    A marker's completion is compute using this formula (where :math:`G_i` is
    the set of genotypes for the marker :math:`i`):

    .. math::
        Completion_i = \\frac{||g \\in G_i \\textrm{ where } g \\neq 0||}
                             {||G_i||}

    The pairwise concordance between duplicated markers is compute as follow
    (where :math:`G_i` and :math:`G_j` are the sets of genotypes for markers
    :math:`i` and :math:`j`, respectively):

    .. math::
        Concordance_{i,j} = \\frac{
            ||g \\in G_i \\cup G_j \\textrm{ where } g_i = g_j \\neq 0||
        }{
            ||g \\in G_i \\cup G_j \\textrm{ where } g \\neq 0||
        }

    Hence, we only computes the numerators and denominators of the completion
    and concordance, for future reference.

    .. note::
        When the genotypes are not comparable, the function tries to flip one
        of the genotype to see if it becomes comparable.

    """
    # The completion data type
    completion = np.array([[0 for i in xrange(len(tped))],
                           [0 for i in xrange(len(tped))]])

    # The concordance data type
    concordance = {}
    for snpID in snps.keys():
        nbDup = len(snps[snpID])
        concordance[snpID] = [
            np.asmatrix(np.zeros((nbDup, nbDup), dtype=int)),
            np.asmatrix(np.zeros((nbDup, nbDup), dtype=int))
        ]

    # The women and the no sex
    menIndex = np.where(tfam[:, 4] == "1")
    womenIndex = np.where(tfam[:, 4] == "2")
    noSexIndex = np.where(tfam[:, 4] == "0")

    for snpID, indexes in snps.iteritems():
        nbDup = len(indexes)
        currGenotypes = tped[indexes, 4:]
        chromosome, position = snpID

#         if chromosome == "24":
#             # Remove the heterozygous men
#             menToRemove = getIndexOfHeteroMen(currGenotypes, menIndex)
#             # Remove the women and the no sex
#             currGenotypes = np.delete(currGenotypes,
#                                        np.hstack((womenIndex, noSexIndex,
#                                                    menToRemove)), 1)
#         elif chromosome == "23":
#             # Remove the heterozygous men
#             menToRemove = getIndexOfHeteroMen(currGenotypes, menIndex)
#             # Remove the no sex
#             currGenotypes = np.delete(currGenotypes,
#                                        np.hstack((noSexIndex, menToRemove)),
#                                        1)

        for i in xrange(nbDup):
            # Compute completion here
            completion[0][indexes[i]] = len(
                np.where(currGenotypes[i] != "0 0")[0]
            )
            completion[1][indexes[i]] = len(currGenotypes[i])
            for j in xrange(i+1, nbDup):
                # Compute concordance here
                # Removing samples with at least one null genotype
                nullGenotypeIndexes = np.where(
                    np.any(currGenotypes[[i, j]] == "0 0", 0)
                )
                subGenotypes = np.delete(
                    currGenotypes,
                    nullGenotypeIndexes,
                    1,
                )

                # Finding the errors in the subseted genotypes
                errorIndexes = np.where(subGenotypes[i] != subGenotypes[j])[0]
                nbDiff = len(errorIndexes)

                for k in errorIndexes:
                    # Getting the genotypes
                    genotype1 = set(subGenotypes[i, k].split(" "))
                    genotype2 = set(subGenotypes[j, k].split(" "))

                    # Checking for flips
                    if len(genotype1) == len(genotype2):
                        # Both have the same number of different alleles,
                        # so they might be flipped
                        genotype2 = flipGenotype(genotype2)
                        if genotype1 == genotype2:
                            # The genotypes are equivalent after the flip
                            nbDiff -= 1

                # Updating the concordance
                nbTot = len(subGenotypes[i])
                concordance[snpID][0][i, j] = nbTot - nbDiff
                concordance[snpID][0][j, i] = nbTot - nbDiff
                if nbTot == 0:
                    # We will have a division by 0...
                    nbTot = 1
                concordance[snpID][1][i, j] = nbTot
                concordance[snpID][1][j, i] = nbTot

    for snpID in concordance.iterkeys():
        for i in range(len(concordance[snpID][0])):
            concordance[snpID][0][i, i] = 1
            concordance[snpID][1][i, i] = 1

    return completion, concordance


[docs]def getIndexOfHeteroMen(genotypes, menIndex):
    """Get the indexes of heterozygous men.

    :param genotypes: the genotypes of everybody.
    :param menIndex: the indexes of the men (for the genotypes).

    :type genotypes: numpy.array
    :type menIndex: numpy.array

    :returns: a :py:class:`numpy.array` containing the indexes of the genotypes
              to remove.

    Finds the mean that have a heterozygous genotype for this current marker.
    Usually used on sexual chromosomes.

    """
    toRemove = set()
    for i in menIndex[0]:
        for genotype in [set(j.split(" ")) for j in genotypes[:, i]]:
            if len(genotype) != 1:
                # We have an heterozygous
                toRemove.add(i)

    toRemove = list(toRemove)
    toRemove.sort()

    return (np.array(toRemove, dtype=int),)


[docs]def flipGenotype(genotype):
    """Flips a genotype.

    :param genotype: the genotype to flip.

    :type genotype: set

    :returns: the new flipped genotype (as a :py:class:`set`)

    .. testsetup::

        from pyGenClean.DupSNPs.duplicated_snps import flipGenotype

    .. doctest::

        >>> flipGenotype({"A", "T"})
        set(['A', 'T'])
        >>> flipGenotype({"C", "T"})
        set(['A', 'G'])
        >>> flipGenotype({"T", "G"})
        set(['A', 'C'])
        >>> flipGenotype({"0", "0"})
        Traceback (most recent call last):
            ...
        ProgramError: 0: unkown allele
        >>> flipGenotype({"A", "N"})
        Traceback (most recent call last):
            ...
        ProgramError: N: unkown allele

    """
    newGenotype = set()
    for allele in genotype:
        if allele == "A":
            newGenotype.add("T")
        elif allele == "C":
            newGenotype.add("G")
        elif allele == "T":
            newGenotype.add("A")
        elif allele == "G":
            newGenotype.add("C")
        else:
            msg = "%(allele)s: unknown allele" % locals()
            raise ProgramError(msg)

    return newGenotype


# def processTPED(uniqueSNPs, duplicatedSNPs, mapF, fileName, tfam, prefix):
[docs]def processTPED(uniqueSNPs, mapF, fileName, tfam, prefix):
    """Process the TPED file.

    :param uniqueSNPs: the unique markers.
    :param mapF: a representation of the ``map`` file.
    :param fileName: the name of the ``tped`` file.
    :param tfam: the name of the ``tfam`` file.
    :param prefix: the prefix of all the files.

    :type uniqueSNPs: dict
    :type mapF: list
    :type fileName: str
    :type tfam: str
    :type prefix: str

    :returns: a tuple with the representation of the ``tped`` file
              (:py:class:`numpy.array`) as first element, and the updated
              position of the duplicated markers in the ``tped``
              representation.

    Copies the ``tfam`` file into ``prefix.unique_snps.tfam``. While reading
    the ``tped`` file, creates a new one (``prefix.unique_snps.tped``)
    containing only unique markers.

    """
    # Copying the tfam file
    try:
        shutil.copy(tfam, prefix + ".unique_snps.tfam")
    except IOError:
        msg = "%s: can't write file" % prefix + ".unique_snps.tfam"
        raise ProgramError(msg)

    tped = []
    updatedSNPs = defaultdict(list)
    outputFile = None
    try:
        outputFile = open(prefix + ".unique_snps.tped", "w")
    except IOError:
        msg = "%s: can't write to file" % prefix + ".unique_snps.tped"
        raise ProgramError(msg)
    nbSNP = 0
    with open(fileName, 'r') as inputFile:
        for line in inputFile:
            nbSNP += 1
            row = line.rstrip("\r\n").split("\t")
            snpInfo = row[:4]
            genotype = [i.upper() for i in row[4:]]

            chromosome = snpInfo[0]
            position = snpInfo[3]

            if (chromosome, position) in uniqueSNPs:
                # Printing the new TPED file (unique SNPs only)
                print >>outputFile, "\t".join(snpInfo + genotype)
            else:
                # Saving the TPED file (duplicated samples only)
                currPos = len(tped)
                tped.append(tuple(snpInfo + genotype))
                updatedSNPs[(chromosome, position)].append(currPos)
    outputFile.close()

    if len(mapF) != nbSNP:
        msg = "%(fileName)s: no the same number of SNPs than MAP " \
              "file" % locals()
        raise ProgramError(msg)

    tped = np.array(tped)

    return tped, updatedSNPs


[docs]def findUniques(mapF):
    """Finds the unique markers in a MAP.

    :param mapF: representation of a ``map`` file.

    :type mapF: list

    :returns: a :py:class:`dict` containing unique markers (according to their
              genomic localisation).

    """
    uSNPs = {}
    dSNPs = defaultdict(list)
    for i, row in enumerate(mapF):
        chromosome = row[0]
        position = row[3]
        snpID = (chromosome, position)
        if snpID not in uSNPs:
            # This is the first time we see this sample
            uSNPs[snpID] = i
        else:
            # We have seen this sample at least once...
            if snpID not in dSNPs:
                # This is the second time we see this sample...
                dSNPs[snpID].extend([uSNPs[snpID], i])
            else:
                # We have seen this sample multiple times
                dSNPs[snpID].append(i)

    # Removing the duplicates from the unique samples
    for snpID in dSNPs.iterkeys():
        if snpID in uSNPs:
            del uSNPs[snpID]

    return uSNPs


[docs]def readTFAM(fileName):
    """Reads the TFAM file.

    :param fileName: the name of the ``tfam`` file.

    :type fileName: str

    :returns: a representation the ``tfam`` file (:py:class:`numpy.array`).

    """
    # Saving the TFAM file
    tfam = None
    with open(fileName, 'r') as inputFile:
        tfam = [
            tuple(i.rstrip("\r\n").split("\t")) for i in inputFile.readlines()
        ]

    tfam = np.array(tfam)

    return tfam


[docs]def readMAP(fileName, prefix):
    """Reads the MAP file.

    :param fileName: the name of the ``map`` file.

    :type fileName: str

    :returns: a list of tuples, representing the ``map`` file.

    While reading the ``map`` file, it saves a file
    (``prefix.duplicated_marker_names``) containing the name of the unique
    duplicated markers.

    """
    # Saving the MAP file
    mapF = None
    with open(fileName, 'r') as inputFile:
        mapF = [
            tuple(i.rstrip("\r\n").split("\t")) for i in inputFile.readlines()
        ]

    # Test for uniqueness of names
    marker_names = np.array([i[1] for i in mapF])
    nb_with_same_name = len(marker_names) - len(np.unique(marker_names))
    if nb_with_same_name > 0:
        logger.info("  - {} markers with same name".format(nb_with_same_name))
        u, indices = np.unique(marker_names, return_index=True)
        duplicated_indices = np.setdiff1d(np.arange(len(marker_names)),
                                          indices)
        duplicated_indices = np.in1d(marker_names,
                                     marker_names[duplicated_indices])
        marker_chr_pos = np.array([(i[0], i[3]) for i in mapF],
                                  dtype=[("chr", int),
                                         ("pos", int)])[duplicated_indices]
        marker_names = marker_names[duplicated_indices]
        try:
            duplicated_marker_names = open(prefix + ".duplicated_marker_names",
                                           "w")
        except IOError:
            msg = "{}.duplicated_marker_names: can't write file".format(prefix)
            raise ProgramError(msg)
        for i in xrange(len(marker_names)):
            print >>duplicated_marker_names, "\t".join([marker_names[i]] +
                                                       map(str,
                                                           marker_chr_pos[i]))
        duplicated_marker_names.close()

    return mapF


[docs]def checkArgs(args):
    """Checks the arguments and options.

    :param args: an object containing the options of the program.

    :type args: argparse.Namespace

    :returns: ``True`` if everything was OK.

    If there is a problem with an option, an exception is raised using the
    :py:class:`ProgramError` class, a message is printed to the
    :class:`sys.stderr` and the program exists with code 1.

    """
    # Checking the input files
    for suffix in [".tped", ".tfam", ".map"]:
        fileName = args.tfile + suffix
        if not os.path.isfile(fileName):
            msg = "%(fileName)s: no such file" % locals()
            raise ProgramError(msg)

    # Checking the concordance threshold
    if ((args.snp_concordance_threshold < 0) or
            (args.snp_concordance_threshold > 1)):
        msg = "snp-concordance-threshold: must be between 0 and 1 " \
              "(not %f)" % args.snp_concordance_threshold
        raise ProgramError(msg)

    # Checking the completion threshold
    if ((args.snp_completion_threshold < 0) or
            (args.snp_completion_threshold > 1)):
        msg = "snp-completion-threshold: must be between 0 and 1 " \
              "(not %f)" % args.snp_completion_threshold
        raise ProgramError(msg)

    # Checking the difference in frequency
    if (args.frequency_difference < 0) or (args.frequency_difference > 1):
        msg = ("{}: maximal frequency difference: value must be between 0 and "
               "1 (inclusively)".format())
        raise ProgramError(msg)

    return True


[docs]def parseArgs(argString=None):  # pragma: no cover
    """Parses the command line options and arguments.

    :param argString: the options.

    :type argString: list

    :returns: A :py:class:`argparse.Namespace` object created by the
              :py:mod:`argparse` module. It contains the values of the
              different options.

    =============================== ====== ====================================
                 Options             Type              Description
    =============================== ====== ====================================
    ``--tfile``                     string The input file prefix (Plink tfile).
    ``--snp-completion-threshold``  float  The completion threshold to consider
                                           a replicate when choosing the best
                                           replicates.
    ``--snp-concordance-threshold`` float  The concordance threshold to
                                           consider a replicate when choosing
                                           the best replicates.
    ``--frequency_difference``      float  The maximum difference in frequency
                                           between duplicated markers.
    ``--out``                       string The prefix of the output files.
    =============================== ====== ====================================

    .. note::
        No option check is done here (except for the one automatically done by
        argparse). Those need to be done elsewhere (see :py:func:`checkArgs`).

    """
    args = None
    if argString is None:
        args = parser.parse_args()
    else:
        args = parser.parse_args(argString)

    return args


[docs]class ProgramError(Exception):
    """An :py:class:`Exception` raised in case of a problem.

    :param msg: the message to print to the user before exiting.

    :type msg: str

    """
    def __init__(self, msg):
        """Construction of the :py:class:`ProgramError` class.

        :param msg: the message to print to the user

        :type msg: str

        """
        self.message = str(msg)

    def __str__(self):
        return self.message


# The parser object
pretty_name = "Duplicated markers"
desc = "Extracts and merges duplicated markers."
long_desc = ("The script searches for duplicated markers according to "
             "chromosomal location. It then evaluates concordance, completion "
             "rate, allele calls and minor allele frequency (MAF). The script "
             "keeps markers with different allele calls or with different "
             "MAF. If thresholds are met, the script merges and completes the "
             "genotypes.")
parser = argparse.ArgumentParser(description=desc)
parser.add_argument("-v", "--version", action="version",
                    version="pyGenClean version {}".format(__version__))

# The INPUT files
group = parser.add_argument_group("Input File")
group.add_argument("--tfile", type=str, metavar="FILE", required=True,
                   help=("The input file prefix (will find the tped and tfam "
                         "file by appending the prefix to .tped and .tfam, "
                         "respectively. A .map file is also required."))
# The options
group = parser.add_argument_group("Options")
group.add_argument("--snp-completion-threshold", type=float, metavar="FLOAT",
                   default=0.9, help=("The completion threshold to consider "
                                      "a replicate when choosing the best "
                                      "replicates and for composite creation. "
                                      "[default: %(default).1f]"))
group.add_argument("--snp-concordance-threshold", type=float, metavar="FLOAT",
                   default=0.98, help=("The concordance threshold to consider "
                                       "a replicate when choosing the best "
                                       "replicates and for composite "
                                       "creation. [default: %(default).2f]"))
group.add_argument("--frequency_difference", type=float, metavar="FLOAT",
                   default=0.05, help=("The maximum difference in frequency "
                                       "between duplicated markers [default: "
                                       "%(default).2f]"))
# The OUTPUT files
group = parser.add_argument_group("Output File")
group.add_argument("--out", type=str, metavar="FILE", default="dup_snps",
                   help=("The prefix of the output files. [default: "
                         "%(default)s]"))


[docs]def safe_main():
    """A safe version of the main function (that catches ProgramError)."""
    try:
        main()
    except KeyboardInterrupt:
        logger.info("Cancelled by user")
        sys.exit(0)
    except ProgramError as e:
        logger.error(e.message)
        parser.error(e.message)


if __name__ == "__main__":
    safe_main()