Source code for pyGenClean.DupSamples.duplicated_samples

#!/usr/bin/env python2.7

# This file is part of pyGenClean.
#
# pyGenClean is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation, either version 3 of the License, or (at your option) any later
# version.
#
# pyGenClean is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
# A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along with
# pyGenClean.  If not, see <http://www.gnu.org/licenses/>.


import os
import sys
import shutil
import random
import logging
import argparse
import StringIO
from collections import defaultdict

import numpy as np

from .. import __version__


logger = logging.getLogger("duplicated_samples")


[docs]def main(argString=None):
    """Check for duplicated samples in a tfam/tped file.

    :param argString: the options

    :type argString: list

    Here are the steps for the duplicated samples step.

    1.  Prints the options.
    2.  Reads the ``tfam`` file (:py:func:`readTFAM`).
    3.  Separate the duplicated samples from the unique samples
        (:py:func:`findDuplicates`).
    4.  Writes the unique samples into a file named
        ``prefix.unique_samples.tfam`` (:py:func:`printUniqueTFAM`).
    5.  Reads the ``tped`` file and write into ``prefix.unique_samples.tped``
        the pedigree file for the unique samples (:py:func:`processTPED`).
        Saves in memory the pedigree for the duplicated samples. Updates the
        indexes of the duplicated samples.
    6.  If there are no duplicated samples, simply copies the files
        ``prefix.unique_samples`` (``tped`` and ``tfam``) to
        ``prefix.final.tfam`` and ``prefix..final.tped``, respectively.
    7.  Computes the completion (for each of the duplicated samples) and the
        concordance of each sample pairs (:py:func:`computeStatistics`).
    8.  Prints statistics (concordance and completion)
        (:py:func:`printStatistics`).
    9.  We print the concordance matrix for each duplicated samples
        (:py:func:`printConcordance`).
    10. We print the ``tped`` and the ``tfam`` file for the duplicated samples
        (``prefix.duplicated_samples``)
        (:py:func:`printDuplicatedTPEDandTFAM`).
    11. Choose the best of each duplicates (to keep and to complete) according
        to completion and concordance (:py:func:`chooseBestDuplicates`).
    12. Creates a unique ``tped`` and ``tfam`` from the duplicated samples by
        completing the best chosen one with the other samples
        (:py:func:`createAndCleanTPED`).
    13. Merge the two tfiles together (``prefix.unique_samples`` and
        ``prefix.chosen_samples``) to create the final dataset
        (``prefix.final``) (:py:func:`addToTPEDandTFAM`).

    """
    # Getting and checking the options
    args = parseArgs(argString)
    checkArgs(args)

    logger.info("Options used:")
    for key, value in vars(args).iteritems():
        logger.info("  --{} {}".format(key.replace("_", "-"), value))

    # Reading the tfam file
    logger.info("Reading TFAM")
    tfam = readTFAM(args.tfile + ".tfam")

    # Find duplicated samples
    logger.info("Finding duplicated samples")
    uniqueSamples, duplicatedSamples = findDuplicates(tfam)

    # Prints the unique tfam
    logger.info("Creating TFAM for unique samples")
    printUniqueTFAM(tfam, uniqueSamples, args.out)

    # Process the TPED file
    logger.info("Reading TPED (and creating TPED for unique samples)")
    tped, tpedSamples = processTPED(uniqueSamples, duplicatedSamples,
                                    args.tfile + ".tped", args.out)

    if len(duplicatedSamples) == 0:
        logger.info("There are no duplicates in {}.tfam".format(args.tfile))
        # There are no duplicated samples
        try:
            shutil.copy(args.out + ".unique_samples.tfam",
                        args.out + ".final.tfam")
        except IOError:
            msg = "%s.unique_samples.tfam: can't copy to " \
                  "%s.final.tfam" % (args.out, args.out)
            raise ProgramError(msg)
        try:
            shutil.copy(args.out + ".unique_samples.tped",
                        args.out + ".final.tped")
        except IOError:
            msg = "%s.unique_samples.tped: can't copy to " \
                  "%s.final.tped" % (args.out, args.out)
            raise ProgramError(msg)

    else:
        # We continue
        # Compute statistics
        logger.info("Computing the completion and concordance of duplicated "
                    "samples")
        completion, concordance = computeStatistics(tped, tfam, tpedSamples,
                                                    duplicatedSamples,
                                                    args.out)

        # Print the statistics
        logger.info("Printing the statistics")
        completion_percentage = printStatistics(completion, concordance,
                                                tpedSamples, duplicatedSamples,
                                                args.out)

        # Print the concordance file
        logger.info("Printing the concordance file")
        concordance_percentage = printConcordance(concordance, args.out)

        # Print the duplicated TFAM and TPED
        logger.info("Creating TPED and TFAM for duplicated samples")
        printDuplicatedTPEDandTFAM(tped, tfam, tpedSamples, duplicatedSamples,
                                   args.out)

        # Choose the best duplicates
        logger.info("Choosing the best duplicates")
        chosenSamples, comp, conc = chooseBestDuplicates(
            tped,
            tpedSamples,
            duplicatedSamples,
            completion_percentage,
            concordance_percentage,
            args.out,
        )

        # Clean the genotype of the chosen samples
        logger.info("Cleaning and creating unique TPED and TFAM from "
                    "duplicated samples")
        newTPED, newTFAM = createAndCleanTPED(
            tped,
            tfam,
            tpedSamples,
            duplicatedSamples,
            chosenSamples,
            args.out,
            comp,
            args.sample_completion_threshold,
            conc,
            args.sample_concordance_threshold,
        )

        # Add the chosen TPED and TFAM
        logger.info("Creating final TPED and TFAM file")
        addToTPEDandTFAM(newTPED, newTFAM, args.out,
                         args.out + ".unique_samples")


[docs]def addToTPEDandTFAM(tped, tfam, prefix, toAddPrefix):
    """Append a tfile to another, creating a new one.

    :param tped: the ``tped`` that will be appended to the other one.
    :param tfam: the ``tfam`` that will be appended to the other one.
    :param prefix: the prefix of all the files.
    :param toAddPrefix: the prefix of the final file.

    :type tped: :py:class:`numpy.array`
    :type tfam: :py:class:`numpy.array`
    :type prefix: str
    :type toAddPrefix: str

    Here are the steps of this function:

    1. Writes the ``tped`` into ``prefix.chosen_samples.tped``.
    2. Writes the ``tfam`` into ``prefix.chosen_samples.tfam``.
    3. Copies the previous ``tfam`` (``toAddPrefix.tfam``) into the final
       ``tfam`` (``prefix.final.tfam``).
    4. Append the ``tfam`` to the final ``tfam`` (``prefix.final.tfam``).
    5. Reads the previous ``tped`` (``toAddPrefix.tped``) and append the new
       ``tped`` to it, writing the final one (``prefix.final.tped``).

    .. warning::
        The ``tped`` and ``tfam`` variables need to contain at least one
        sample.

    """
    # First, writing the chosen tped
    tpedFile = None
    try:
        tpedFile = open(prefix + ".chosen_samples.tped", "w")
    except IOError:
        msg = "%(prefix)s.chosen_samples.tped: can't write file" % locals()
        raise ProgramError(msg)
    for row in tped:
        print >>tpedFile, "\t".join(row)
    tpedFile.close()

    # Second, writing the chosen tfam
    tfamFile = None
    try:
        tfamFile = open(prefix + ".chosen_samples.tfam", "w")
    except IOError:
        msg = "%(prefix)s.chosen_samples.tfam: can't write file" % locals()
        raise ProgramError(msg)
    for row in tfam:
        print >>tfamFile, "\t".join(row)
    tfamFile.close()

    # The final TFAM file (copying and appending)
    try:
        shutil.copyfile(toAddPrefix + ".tfam", prefix + ".final.tfam")
    except IOError:
        msg = "%(toAddPrefix)s.tfam: can't copy to " \
              "%(prefix)s.final.tfam" % locals()
        raise ProgramError(msg)
    newTFAM = None
    try:
        newTFAM = open(prefix + ".final.tfam", "a")
    except IOError:
        msg = "%(prefix)s.final.tfam: can't append" % locals()
        raise ProgramError(msg)
    for row in tfam:
        print >>newTFAM, "\t".join(row)
    newTFAM.close()

    # The final TPED file
    newTPED = None
    try:
        newTPED = open(prefix + ".final.tped", "w")
    except IOError:
        msg = "%(prefix)s.final.tped: can't write file" % locals()
        raise ProgramError(msg)
    try:
        with open(toAddPrefix + ".tped", "r") as inputFile:
            for i, line in enumerate(inputFile):
                row = line.rstrip("\r\n").split("\t")
                originalSNP = row[1]
                toAdd = tped[i]
                if originalSNP != toAdd[1]:
                    msg = "%(toAddPrefix)s.tped: not good sort" % locals()
                    raise ProgramError(msg)
                row.extend(list(toAdd[4:]))
                print >>newTPED, "\t".join(row)
    except IOError:
        msg = "%(toAddPrefix)s.tped: no such file" % locals()
        raise ProgramError(msg)
    newTPED.close()


[docs]def createAndCleanTPED(tped, tfam, samples, oldSamples, chosenSamples,
                       prefix, completion, completionT, concordance,
                       concordanceT):
    """Complete a TPED for duplicate samples.

    :param tped: the ``tped`` containing the duplicated samples.
    :param tfam: the ``tfam`` containing the duplicated samples.
    :param samples: the updated position of the samples in the ``tped``
                    containing only duplicated samples.
    :param oldSamples: the original duplicated sample positions.
    :param chosenSamples: the position of the chosen samples.
    :param prefix: the prefix of all the files.
    :param completion: the completion of each of the duplicated samples.
    :param completionT: the completion threshold.
    :param concordance: the pairwise concordance of each of the duplicated
                        samples.
    :param concordanceT: the concordance threshold.

    :type tped: :py:class:`numpy.array`
    :type tfam: :py:class:`numpy.array`
    :type samples: dict
    :type oldSamples: dict
    :type chosenSamples: dict
    :type prefix: str
    :type completion: :py:class:`numpy.array`
    :type completionT: float
    :type concordance: dict
    :type concordanceT: float

    Using a ``tped`` containing duplicated samples, it creates a ``tped``
    containing unique samples by completing a chosen sample with the other
    replicates.

    .. note::
        A chosen sample is not completed using bad replicates (those that
        don't have a concordance or a completion higher than a certain
        threshold). The bad replicates are written in the file
        ``prefix.not_good_enough``.

    """
    zeroedOutFile = None
    try:
        zeroedOutFile = open(prefix + ".zeroed_out", "w")
    except IOError:
        msg = "%(prefix)s.zeroed_out: can't write file" % locals()
        raise ProgramError(msg)
    print >>zeroedOutFile, "\t".join(["famID", "indID", "snpID"])

    notGoodEnoughFile = None
    try:
        notGoodEnoughFile = open(prefix + ".not_good_enough", "w")
    except IOError:
        msg = "%(prefix)s.not_good_enough: can't write file" % locals()
        raise ProgramError(msg)
    print >>notGoodEnoughFile, "\t".join(["origIndex", "dupIndex", "famID",
                                          "indID", "reason"])
    notGoodEnoughSamples = defaultdict(set)

    # Split the tped in 'snpInfo' and 'genotypes'
    snpInfo = tped[:, :4]
    genotypes = tped[:, 4:]

    # Getting the completion and concordance indexes
    completionSampleID = {}
    concordanceSampleID = {}
    for sampleID, indexes in samples.iteritems():
        # Checking the completion
        completionToKeep = np.where(completion[indexes] >= completionT)[0]
        completionToRemove = np.where(completion[indexes] < completionT)[0]
        for k in completionToRemove:
            notGoodEnoughSamples[(str(oldSamples[sampleID][k]+1),
                                  str(indexes[k]+1), sampleID[0],
                                  sampleID[1])].add("completion")

        # Checking the concordance
        concordanceToKeep = np.where(concordance[sampleID] >= concordanceT)[0]
        concordanceToRemove = np.where(
            concordance[sampleID] < concordanceT
        )[0]
        for k in concordanceToRemove:
            notGoodEnoughSamples[(str(oldSamples[sampleID][k]+1),
                                  str(indexes[k]+1), sampleID[0],
                                  sampleID[1])].add("concordance")

        completionSampleID[sampleID] = completionToKeep
        concordanceSampleID[sampleID] = concordanceToKeep

    for i, curSNPgenotypes in enumerate(genotypes):
        for sampleID, indexes in samples.iteritems():
            # Getting the concordance and the completion
            concordanceToKeep = concordanceSampleID[sampleID]
            completionToKeep = completionSampleID[sampleID]

            # Getting the chosen sample index
            chosenOne = chosenSamples[sampleID]

            # Getting the duplicates' genotypes
            curGenotypes = curSNPgenotypes[indexes]

            # Subsetting for the good completion and concordance
            curGenotypes = curGenotypes[np.intersect1d(concordanceToKeep,
                                                       completionToKeep)]

            # Removing the no call from the genotypes
            cleanedCurGenotype = curGenotypes[np.where(curGenotypes != "0 0")]

            # Checking the number of unique genotypes
            uniqueGenotypes = np.unique(cleanedCurGenotype)

            # Checking the number of unique genotypes (except 0 0)
            if len(uniqueGenotypes) > 1:
                # There are more than one unique genotypes (except 0 0)
                # len = 0 means all were 0 0
                # len = 1 means all the same
                # len > 1 means more than one unique genotypes
                possibleAlleles = [
                    set() for k in xrange(len(uniqueGenotypes))
                ]
                for k, geno in enumerate(uniqueGenotypes):
                    possibleAlleles[k] |= set(geno.split(" "))
                allEqual = True
                for k in xrange(len(possibleAlleles)):
                    for l in xrange(k+1, len(possibleAlleles)):
                        if possibleAlleles[k] != possibleAlleles[l]:
                            allEqual = False
                            break
                    if not allEqual:
                        break
                if not allEqual:
                    # Changing the chosen sample's genotype to no call
                    tped[i][chosenOne+4] = "0 0"
                    print >>zeroedOutFile, "\t".join([sampleID[0], sampleID[1],
                                                      snpInfo[i][1]])
            else:
                # Do we want to complete the chosen sample's genotype???
                pass

    # Writing the not good enough file
    for key, value in notGoodEnoughSamples.iteritems():
        print >>notGoodEnoughFile, "\t".join(list(key) + [";".join(value)])

    # Closing the output files
    zeroedOutFile.close()
    notGoodEnoughFile.close()

    # Getting the indexes to keep
    sampleToKeep = np.array(chosenSamples.values()) + 4
    sampleToKeep.sort()
    toKeep = np.append(np.array(range(4)), sampleToKeep)

    # Subsetting the tped
    newTPED = tped[:, toKeep]

    # Getting the indexes of the tfam
    toKeep = []
    for sampleID, indexes in oldSamples.iteritems():
        i = samples[sampleID].index(chosenSamples[sampleID])
        toKeep.append(indexes[i])
    toKeep = np.array(toKeep)
    toKeep.sort()

    # Subsetting the tfam
    newTFAM = tfam[toKeep]

    return newTPED, newTFAM


[docs]def chooseBestDuplicates(tped, samples, oldSamples, completion,
                         concordance_all, prefix):
    """Choose the best duplicates according to the completion rate.

    :param tped: the ``tped`` containing the duplicated samples.
    :param samples: the updated position of the samples in the tped containing
                    only duplicated samples.
    :param oldSamples: the original duplicated sample positions.
    :param completion: the completion of each of the duplicated samples.
    :param concordance_all: the concordance of every duplicated samples.
    :param prefix: the prefix of all the files.

    :type tped: :py:class:`numpy.array`
    :type samples: dict
    :type oldSamples: dict
    :type completion: :py:class:`numpy.array`
    :type concordance_all: dict
    :type prefix: str

    :returns: a tuple where the first element is a list of the chosen samples'
              indexes, the second on is the completion and the last one is the
              concordance (a map).

    These are the steps to find the best duplicated sample:

    1. Sort the list of concordances.
    2. Sort the list of completions.
    3. Choose the best of the concordance and put in a set.
    4. Choose the best of the completion and put it in a set.
    5. Compute the intersection of the two sets. If there is one sample or
       more, then randomly choose one sample.
    6. If the intersection doesn't contain at least one sample, redo steps 3
       and 4, but increase the number of chosen best by one. Redo step 5 and 6
       (if required).

    The chosen samples are written in ``prefix.chosen_samples.info``. The rest
    are written in ``prefix.excluded_samples.info``.

    """
    # The output files
    chosenFile = None
    try:
        chosenFile = open(prefix + ".chosen_samples.info", "w")
    except IOError:
        msg = "%(prefix)s.chosen_samples.info: can't write file" % locals()
        raise ProgramError(msg)
    print >>chosenFile, "\t".join(["origIndex", "dupIndex", "famID", "indID"])

    excludedFile = None
    try:
        excludedFile = open(prefix + ".excluded_samples.info", "w")
    except IOError:
        msg = "%(prefix)s.excluded_samples.info: can't write file" % locals()
        raise ProgramError(msg)
    print >>excludedFile, "\t".join(["origIndex", "dupIndex", "famID",
                                     "indID"])

    # For each duplicated sample
    chosenIndexes = {}
    sampleConcordance = {}
    for sample, indexes in samples.iteritems():
        # Getting the completion for those duplicated samples
        currCompletion = completion[indexes]

        # Sorting those completion
        sortedCompletionIndexes = np.argsort(currCompletion)

        # Getting the concordance
        concordance = concordance_all[sample]
        currConcordance = [[] for i in xrange(len(indexes))]
        for i in xrange(len(indexes)):
            indexToKeep = list(set(range(len(indexes))) - set([i]))
            currConcordance[i] = np.mean(concordance[i, indexToKeep])
        currConcordance = np.array(currConcordance)
        if sample not in sampleConcordance:
            sampleConcordance[sample] = currConcordance

        # Sorting the concordance
        sortedConcordanceIndexes = np.argsort(currConcordance)

        # Trying to find the best duplicate to keep
        nbToCheck = 1
        chosenIndex = None
        while nbToCheck <= len(indexes):
            # Getting the `nbToCheck` best value (higher to lower)
            completionValue = currCompletion[
                sortedCompletionIndexes[nbToCheck*-1]
            ]
            concordanceValue = currConcordance[
                sortedConcordanceIndexes[nbToCheck*-1]
            ]

            # Getting the indexes to consider
            completionToConsider = set(
                np.where(currCompletion >= completionValue)[0]
            )
            concordanceToConsider = set(
                np.where(currConcordance >= concordanceValue)[0])

            # Getting the intersection of the indexes
            toConsider = concordanceToConsider & completionToConsider
            if len(toConsider) >= 1:
                chosenIndex = random.choice(list(toConsider))
                break
            nbToCheck += 1

        if chosenIndex is None:
            msg = "Could not choose the best sample ID for {}".format(sample)
            raise ProgramError(msg)

        # Printing the chosen samples
        print >>chosenFile, "\t".join([str(oldSamples[sample][chosenIndex]+1),
                                       str(indexes[chosenIndex]+1), sample[0],
                                       sample[1]])

        # Printing the excluded samples
        for i, index in enumerate(indexes):
            if i != chosenIndex:
                print >>excludedFile, "\t".join([str(oldSamples[sample][i]+1),
                                                 str(index+1), sample[0],
                                                 sample[1]])

        chosenIndexes[sample] = indexes[chosenIndex]

    # Closing the output files
    chosenFile.close()
    excludedFile.close()

    return chosenIndexes, completion, sampleConcordance


[docs]def printDuplicatedTPEDandTFAM(tped, tfam, samples, oldSamples, prefix):
    """Print the TPED and TFAM of the duplicated samples.

    :param tped: the ``tped`` containing duplicated samples.
    :param tfam: the ``tfam`` containing duplicated samples.
    :param samples: the updated position of the samples in the tped containing
                    only duplicated samples.
    :param oldSamples: the original duplicated sample positions.
    :param prefix: the prefix of all the files.

    :type tped: :py:class:`numpy.array`
    :type tfam: :py:class:`numpy.array`
    :type samples: dict
    :type oldSamples: dict
    :type prefix: str

    The ``tped`` and ``tfam`` files are written in
    ``prefix.duplicated_samples.tped`` and ``prefix.duplicated_samples.tfam``,
    respectively.

    """
    # Print the TPED
    outputTPED = None
    try:
        outputTPED = open(prefix + ".duplicated_samples.tped", "w")
    except IOError:
        msg = "%(prefix)s.duplicated_samples.tped: can't write " \
              "file" % locals()
        raise ProgramError(msg)
    for row in tped:
        print >>outputTPED, "\t".join(row)
    outputTPED.close()

    # Print the TFAM
    nbSamples = len(tped[0][4:])
    newTFAM = [0 for i in xrange(nbSamples)]
    for samples, indexes in samples.iteritems():
        oldIndexes = oldSamples[samples]
        for i, index in enumerate(indexes):
            oldIndex = oldIndexes[i]
            newTFAM[index] = tfam[oldIndex]
    outputTFAM = None
    try:
        outputTFAM = open(prefix + ".duplicated_samples.tfam", "w")
    except IOError:
        msg = "%(prefix)s.duplicated_samples.tfam: can't write " \
              "file" % locals()
        raise ProgramError(msg)
    for row in newTFAM:
        print >>outputTFAM, "\t".join(row)
    outputTFAM.close()


[docs]def printConcordance(concordance, prefix):
    """Print the concordance.

    :param concordance: the concordance of each sample.
    :param prefix: the prefix of all the files.

    :type concordance: dict
    :type prefix: str

    :returns: the concordance percentage (dict)

    The concordance is the number of genotypes that are equal when comparing a
    duplicated samples with another one, divided by the total number of
    genotypes (excluding genotypes that are no call [*i.e.* ``0``]). If a
    duplicated sample has 100% of no calls, the concordance will be zero.

    The file ``prefix.concordance`` will contain :math:`N \\times N` matrices
    for each set of duplicated samples.

    """
    outFile = None
    try:
        outFile = open(prefix + ".concordance", "w")
    except IOError:
        msg = "%s: can't write file" % prefix + ".concordance"
        raise ProgramError(msg)

    concordance_percentage = {}
    for key in concordance.iterkeys():
        print >>outFile, "#%s\t%s" % key

        # Doing the division
        none_zero = concordance[key][1] != 0
        true_concordance = np.zeros(np.multiply(*concordance[key][1].shape))
        true_concordance[np.ravel(none_zero)] = np.true_divide(
            concordance[key][0][none_zero],
            concordance[key][1][none_zero],
        )
        true_concordance.shape = concordance[key][1].shape
        true_concordance = np.asmatrix(true_concordance)
        concordance_percentage[key] = true_concordance

        output = StringIO.StringIO()
        np.savetxt(output, true_concordance, delimiter="\t", fmt="%.8f")
        print >>outFile, output.getvalue().rstrip("\r\n")

    outFile.close()

    return concordance_percentage


[docs]def printStatistics(completion, concordance, tpedSamples, oldSamples, prefix):
    """Print the statistics in a file.

    :param completion: the completion of each duplicated samples.
    :param concordance: the concordance of each duplicated samples.
    :param tpedSamples: the updated position of the samples in the tped
                        containing only duplicated samples.
    :param oldSamples: the original duplicated sample positions.
    :param prefix: the prefix of all the files.

    :type completion: :py:class:`numpy.array`
    :type concordance: dict
    :type tpedSamples: dict
    :type oldSamples: dict
    :type prefix: str

    :returns: the completion for each duplicated samples, as a
              :py:class:`numpy.array`.

    Prints the statistics (completion of each samples and pairwise concordance
    between duplicated samples) in a file (``prefix.summary``).

    """

    # Compute the completion percentage on none zero values
    none_zero_indexes = np.where(completion[1] != 0)
    completionPercentage = np.zeros(len(completion[0]), dtype=float)
    completionPercentage[none_zero_indexes] = np.true_divide(
        completion[0, none_zero_indexes],
        completion[1, none_zero_indexes],
    )

    # The output file containing the summary statistics (for each of the
    # duplicated samples, print the mean concordance and the completion).
    outputFile = None
    try:
        outputFile = open(prefix + ".summary", "w")
    except IOError:
        msg = "%(prefix)s.summary: can't write file" % locals()
        raise ProgramError(msg)

    print >>outputFile, "\t".join(["origIndex", "dupIndex", "famID", "indID",
                                   "% completion", "completion",
                                   "mean concordance"])
    for sampleID, indexes in tpedSamples.iteritems():
        for i, index in enumerate(indexes):
            # The indexes
            toPrint = [str(oldSamples[sampleID][i]+1), str(index+1)]
            # The samples
            toPrint.extend(list(sampleID))

            # The completion
            toPrint.append("%.8f" % completionPercentage[index])
            toPrint.append("%d/%d" % (completion[0][index],
                                      completion[1][index]))

            # The concordance (not on total values = 0)
            indexToKeep = list(set(range(len(indexes))) - set([i]))
            values = np.ravel(
                np.asarray(concordance[sampleID][0][i, indexToKeep])
            )
            total_values = np.ravel(
                np.asarray(concordance[sampleID][1][i, indexToKeep])
            )
            currConcordance = np.zeros(len(indexToKeep), dtype=float)
            none_zero_indexes = np.where(total_values != 0)
            currConcordance[none_zero_indexes] = np.true_divide(
                values[none_zero_indexes],
                total_values[none_zero_indexes],
            )
            currConcordance = np.mean(currConcordance)
            toPrint.append("%.8f" % currConcordance)
            print >>outputFile, "\t".join(toPrint)

    # Closing the output file
    outputFile.close()

    return completionPercentage


[docs]def computeStatistics(tped, tfam, samples, oldSamples, prefix):
    """Computes the completion and concordance of each samples.

    :param tped: the ``tped`` containing duplicated samples.
    :param tfam: the ``tfam`` containing duplicated samples.
    :param samples: the updated position of the samples in the tped containing
                    only duplicated samples.
    :param oldSamples: the original duplicated sample positions.
    :param prefix: the prefix of all the files.

    :type tped: :py:class:`numpy.array`
    :type tfam: :py:class:`numpy.array`
    :type samples: dict
    :type oldSamples: dict
    :type prefix: str

    :returns: a tuple containing the completion (:py:class:`numpy.array`) as
              first element, and the concordance (:py:class:`dict`) as last
              element.

    Reads the ``tped`` file and compute the completion for each duplicated
    samples and the pairwise concordance between duplicated samples.

    .. note::
        The completion and concordance computation excludes a markers if it's
        on chromosome 24 and if the sample is a female.

    .. note::
        A missing genotype is encoded by ``0``.

    .. note::
        No percentage is computed here, only the numbers. Percentages are
        computing in other functions: :py:func:`printStatistics`, for
        completion, and :py:func:`printConcordance`, for concordance.


    **Completion**

    Computes the completion of none zero values (where all genotypes of at
    least one duplicated sample are no call [*i.e.* ``0``]). The completion of
    sample :math:`i` (*i.e.* :math:`Comp_i`) is the number of genotypes
    that have a call divided by the total number of genotypes (the set
    :math:`G_i`):

    .. math::
        Comp_i = \\frac{||g \\in G_i\\textrm{ where }g \\neq 0||}{||G_i||}

    .. note::
        We consider a genotype as being missing if the sample is a male and if
        a marker on chromosome 23 or 24 is heterozygous.


    **Concordance**

    Computes the pairwise concordance between duplicated samples. For each
    marker, if both genotypes are not missing, we add one to the total number
    of compared markers. If both genotypes are the same, we add one to the
    number of concordant calls. We write the observed genotype difference in
    the file ``prefix.diff``. The concordance between sample :math:`i` and
    :math:`j` (*i.e.* :math:`Concordance_{i,j}`) is the number of genotypes
    that are equal divided by the total number of genotypes (excluding the no
    calls):

    .. math::
        Concordance_{i,j} = \\frac{
            ||g \\in G_i \\cup G_j \\textrm{ where } g_i = g_j \\neq 0||
        }{
            ||g \\in G_i \\cup G_j \\textrm{ where } g \\neq 0||
        }

    """
    # The diff file containing the genotype differences between a pair of
    # duplicated samples
    diffOutput = None
    try:
        diffOutput = open(prefix + ".diff", "w")
    except IOError:
        msg = "%(prefix)s.diff: can't write file" % locals()
        raise ProgramError(msg)
    print >>diffOutput, "\t".join(["name", "famID", "indID", "dupIndex_1",
                                   "dupIndex_2", "genotype_1", "genotype_2"])

    # The completion data type
    completion = np.array([[0 for i in xrange(len(tped[0][4:]))],
                           [0 for i in xrange(len(tped[0][4:]))]])

    # The concordance data type
    concordance = {}
    for sampleID in samples.keys():
        nbDup = len(samples[sampleID])
        concordance[sampleID] = [
            np.asmatrix(np.zeros((nbDup, nbDup), dtype=int)),
            np.asmatrix(np.zeros((nbDup, nbDup), dtype=int)),
        ]

    #####################################################################
    # Add options for concordance only for hetero SNPs... (all samples) #
    #####################################################################

    for row in tped:
        genotype = row[4:]
        chromosome = row[0]
        snpName = row[1]
        for key, indexes in samples.iteritems():
            for i, index in enumerate(indexes):
                sex = tfam[oldSamples[key][i]][4]
                genotype1 = set(genotype[index].split(" "))

                # Updating the completion
                if not ((chromosome == "24") and (sex == "2")):
                    if (sex == "1") and (chromosome in ["23", "24"]):
                        if ("0" not in genotype1) and (len(genotype1) != 2):
                            completion[0][index] += 1
                    elif "0" not in genotype1:
                        completion[0][index] += 1
                    completion[1][index] += 1

                    # Updating the concordance
                    for j in xrange(i + 1, len(indexes)):
                        genotype2 = set(genotype[indexes[j]].split(" "))

# This is for man on chr 23 and 24, if heterozygous!!! #
# if (sex == "1") and (chromosome in ["23", "24"]):
#     if ("0" not in genotype1) and ("0" not in genotype2):
#         if (len(genotype1) != 2) and (len(genotype2) != 2):
#             concordance[key][1][i,j] += 1
#             concordance[key][1][j,i] += 1
#             if genotype1 == genotype2:
#                 concordance[key][0][i,j] += 1
#                 concordance[key][0][j,i] += 1
#             else:
#                 # Print to diff file
#                 toPrint = [snpName, key[0], key[1],
#                            str(index+1),
#                            str(indexes[j]+1)]
#                 toPrint.append(" ".join(genotype1))
#                 if len(genotype1) == 1:
#                     toPrint[-1] += " %s" % toPrint[-1]
#                 toPrint.append(" ".join(genotype2))
#                 if len(genotype2) == 1:
#                     toPrint[-1] += " %s" % toPrint[-1]
#                 print >>diffOutput, "\t".join(toPrint)
#
# elif ("0" not in genotype1) and ("0" not in genotype2):

                        if ("0" not in genotype1) and ("0" not in genotype2):
                            # Both have calls
                            concordance[key][1][i, j] += 1
                            concordance[key][1][j, i] += 1
                            if genotype1 == genotype2:
                                concordance[key][0][i, j] += 1
                                concordance[key][0][j, i] += 1
                            else:
                                # Print to diff file
                                toPrint = [snpName, key[0], key[1],
                                           str(index+1), str(indexes[j]+1)]
                                toPrint.append(" ".join(genotype1))
                                if len(genotype1) == 1:
                                    toPrint[-1] += " %s" % toPrint[-1]
                                toPrint.append(" ".join(genotype2))
                                if len(genotype2) == 1:
                                    toPrint[-1] += " %s" % toPrint[-1]
                                print >>diffOutput, "\t".join(toPrint)

    diffOutput.close()

    for key in concordance.iterkeys():
        for i in range(len(concordance[key][0])):
            concordance[key][0][i, i] = 1
            concordance[key][1][i, i] = 1

    return completion, concordance


[docs]def processTPED(uniqueSamples, duplicatedSamples, fileName, prefix):
    """Process the TPED file.

    :param uniqueSamples: the position of unique samples.
    :param duplicatedSamples: the position of duplicated samples.
    :param fileName: the name of the file.
    :param prefix: the prefix of all the files.

    :type uniqueSamples: dict
    :type duplicatedSamples: collections.defaultdict
    :type fileName: str
    :type prefix: str

    :returns: a tuple containing the ``tped`` (:py:class:`numpy.array`) as
              first element, and the updated positions of the duplicated
              samples (:py:class:`dict`)

    Reads the entire ``tped`` and prints another one containing only unique
    samples (``prefix.unique_samples.tped``). It then creates a
    :py:class:`numpy.array` containing the duplicated samples.

    """
    # Getting the indexes
    uI = sorted(uniqueSamples.values())
    dI = []
    for item in duplicatedSamples.itervalues():
        dI.extend(item)
    dI.sort()

    tped = []
    outputFile = None
    try:
        outputFile = open(prefix + ".unique_samples.tped", "w")
    except IOError:
        msg = "%s: can't write to file" % prefix + ".unique_samples.tped"
        raise ProgramError
    with open(fileName, 'r') as inputFile:
        for line in inputFile:
            row = line.rstrip("\r\n").split("\t")
            snpInfo = row[:4]
            genotype = row[4:]

            # Printing the new TPED file (unique samples only)
            print >>outputFile, "\t".join(snpInfo + [genotype[i] for i in uI])

            # Saving the TPED file (duplicated samples only)
            tped.append(tuple(snpInfo + [genotype[i] for i in dI]))

    # Closing the output file
    outputFile.close()

    # Updating the positions
    updatedSamples = {}
    for sampleID in duplicatedSamples:
        positions = duplicatedSamples[sampleID]
        newPositions = range(len(positions))
        for i, pos in enumerate(positions):
            newPositions[i] = dI.index(pos)
        updatedSamples[sampleID] = newPositions

    tped = np.array(tped)

    return tped, updatedSamples


[docs]def printUniqueTFAM(tfam, samples, prefix):
    """Prints a new TFAM with only unique samples.

    :param tfam: a representation of a TFAM file.
    :param samples: the position of the samples
    :param prefix: the prefix of the output file name

    :type tfam: list
    :type samples: dict
    :type prefix: str

    """
    fileName = prefix + ".unique_samples.tfam"
    try:
        with open(fileName, "w") as outputFile:
            for i in sorted(samples.values()):
                print >>outputFile, "\t".join(tfam[i])
    except IOError:
        msg = "%(fileName)s: no such file"
        raise ProgramError(msg)


[docs]def findDuplicates(tfam):
    """Finds the duplicates in a TFAM.

    :param tfam: representation of a ``tfam`` file.
    :type tfam: list

    :returns: two :py:class:`dict`, containing unique and duplicated samples
              position.

    """
    uSamples = {}
    dSamples = defaultdict(list)
    for i, row in enumerate(tfam):
        sampleID = tuple(row[:2])
        if sampleID not in uSamples:
            # This is the first time we see this sample
            uSamples[sampleID] = i
        else:
            # We have seen this sample at least once...
            if sampleID not in dSamples:
                # This is the second time we see this sample...
                dSamples[sampleID].extend([uSamples[sampleID], i])
            else:
                # We have seen this sample multiple times
                dSamples[sampleID].append(i)

    # Removing the duplicates from the unique samples
    for sampleID in dSamples.iterkeys():
        if sampleID in uSamples:
            del uSamples[sampleID]

    return uSamples, dSamples


[docs]def readTFAM(fileName):
    """Reads the TFAM file.

    :param fileName: the name of the ``tfam`` file.

    :type fileName: str

    :returns: a list of tuples, representing the ``tfam`` file.

    """
    # Saving the TFAM file
    tfam = None
    with open(fileName, 'r') as inputFile:
        tfam = [
            tuple(i.rstrip("\r\n").split("\t")) for i in inputFile.readlines()
        ]

    tfam = np.array(tfam)

    return tfam


[docs]def checkArgs(args):
    """Checks the arguments and options.

    :param args: a :py:class:`argparse.Namespace` object containing the options
                 of the program.

    :type args: :py:class:`argparse.Namespace`

    :returns: ``True`` if everything was OK.

    If there is a problem with an option, an exception is raised using the
    :py:class:`ProgramError` class, a message is printed to the
    :class:`sys.stderr` and the program exists with code 1.

    """
    # Checking the input files
    for suffix in [".tped", ".tfam"]:
        fileName = args.tfile + suffix
        if not os.path.isfile(fileName):
            msg = "%(fileName)s: no such file" % locals()
            raise ProgramError(msg)

    # Checking the concordance threshold
    if ((args.sample_concordance_threshold < 0) or
            (args.sample_concordance_threshold > 1)):
        msg = "sample-concordance-threshold: must be between 0 and 1 " \
              "(not %f)" % args.sample_concordance_threshold
        raise ProgramError(msg)

    # Checking the completion threshold
    if ((args.sample_completion_threshold < 0) or
            (args.sample_completion_threshold > 1)):
        msg = "sample-completion-threshold: must be between 0 and 1 " \
              "(not %f)" % args.sample_completion_threshold
        raise ProgramError(msg)

    return True


[docs]def parseArgs(argString=None):  # pragma: no cover
    """Parses the command line options and arguments.

    :param argString: the options

    :type argString: list

    :returns: A :py:class:`argparse.Namespace` object created by
              the :py:mod:`argparse` module. It contains the values of the
              different options.

    ================================== ====== =================================
                  Options               Type             Description
    ================================== ====== =================================
    ``--tfile``                        string The input file prefix (of type
                                              ``tfile``).
    ``--sample-completion-threshold``  float  The completion threshold.
    ``--sample-concordance-threshold`` float  The concordance threshold.
    ``--out``                          string The prefix of the output files.
    ================================== ====== =================================

    .. note::
        No option check is done here (except for the one automatically done by
        argparse). Those need to be done elsewhere (see :py:func:`checkArgs`).

    """
    args = None
    if argString is None:
        args = parser.parse_args()
    else:
        args = parser.parse_args(argString)

    return args


[docs]class ProgramError(Exception):
    """An :py:class:`Exception` raised in case of a problem.

    :param msg: the message to print to the user before exiting.

    :type msg: str

    """
    def __init__(self, msg):
        """Construction of the :py:class:`ProgramError` class.

        :param msg: the message to print to the user
        :type msg: str

        """
        self.message = str(msg)

    def __str__(self):
        return self.message


# The parser object
pretty_name = "Duplicated samples"
desc = "Extracts and merges duplicated samples."
long_desc = ("The script evaluates concordance and completion rate. If the "
             "thresholds are met, the script merges and completes the "
             "genotypes.")
parser = argparse.ArgumentParser(description=desc)
parser.add_argument("-v", "--version", action="version",
                    version="pyGenClean version {}".format(__version__))

# The INPUT files
group = parser.add_argument_group("Input File")
group.add_argument("--tfile", type=str, metavar="FILE", required=True,
                   help=("The input file prefix (will find the tped and tfam "
                         "file by appending the prefix to .tped and .tfam, "
                         "respectively.) The duplicated samples should have "
                         "the same identification numbers (both family and "
                         "individual ids.)"))
# The options
group = parser.add_argument_group("Options")
group.add_argument("--sample-completion-threshold", type=float,
                   metavar="FLOAT", default=0.9,
                   help=("The completion threshold to consider a replicate "
                         "when choosing the best replicates and for creating "
                         "the composite samples. [default: %(default).1f]"))
group.add_argument("--sample-concordance-threshold", type=float,
                   metavar="FLOAT", default=0.97,
                   help=("The concordance threshold to consider a replicate "
                         "when choosing the best replicates and for creating "
                         "the composite samples. [default: %(default).2f]"))
# The OUTPUT files
group = parser.add_argument_group("Output File")
group.add_argument("--out", type=str, metavar="FILE",
                   default="dup_samples",
                   help=("The prefix of the output files. [default: "
                         "%(default)s]"))


[docs]def safe_main():
    """A safe version of the main function (that catches ProgramError)."""
    try:
        main()
    except KeyboardInterrupt:
        logger.info("Cancelled by user")
        sys.exit(0)
    except ProgramError as e:
        logger.error(e.message)
        parser.error(e.message)


if __name__ == "__main__":
    safe_main()