Source code for pyGenClean.DupSamples.duplicated_samples

#!/usr/bin/env python2.7

# This file is part of pyGenClean.
#
# pyGenClean is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation, either version 3 of the License, or (at your option) any later
# version.
#
# pyGenClean is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
# A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along with
# pyGenClean.  If not, see <http://www.gnu.org/licenses/>.


import os
import sys
import shutil
import random
import logging
import argparse
import StringIO
from collections import defaultdict

import numpy as np

from .. import __version__


logger = logging.getLogger("duplicated_samples")


[docs]def main(argString=None): """Check for duplicated samples in a tfam/tped file. :param argString: the options :type argString: list Here are the steps for the duplicated samples step. 1. Prints the options. 2. Reads the ``tfam`` file (:py:func:`readTFAM`). 3. Separate the duplicated samples from the unique samples (:py:func:`findDuplicates`). 4. Writes the unique samples into a file named ``prefix.unique_samples.tfam`` (:py:func:`printUniqueTFAM`). 5. Reads the ``tped`` file and write into ``prefix.unique_samples.tped`` the pedigree file for the unique samples (:py:func:`processTPED`). Saves in memory the pedigree for the duplicated samples. Updates the indexes of the duplicated samples. 6. If there are no duplicated samples, simply copies the files ``prefix.unique_samples`` (``tped`` and ``tfam``) to ``prefix.final.tfam`` and ``prefix..final.tped``, respectively. 7. Computes the completion (for each of the duplicated samples) and the concordance of each sample pairs (:py:func:`computeStatistics`). 8. Prints statistics (concordance and completion) (:py:func:`printStatistics`). 9. We print the concordance matrix for each duplicated samples (:py:func:`printConcordance`). 10. We print the ``tped`` and the ``tfam`` file for the duplicated samples (``prefix.duplicated_samples``) (:py:func:`printDuplicatedTPEDandTFAM`). 11. Choose the best of each duplicates (to keep and to complete) according to completion and concordance (:py:func:`chooseBestDuplicates`). 12. Creates a unique ``tped`` and ``tfam`` from the duplicated samples by completing the best chosen one with the other samples (:py:func:`createAndCleanTPED`). 13. Merge the two tfiles together (``prefix.unique_samples`` and ``prefix.chosen_samples``) to create the final dataset (``prefix.final``) (:py:func:`addToTPEDandTFAM`). """ # Getting and checking the options args = parseArgs(argString) checkArgs(args) logger.info("Options used:") for key, value in vars(args).iteritems(): logger.info(" --{} {}".format(key.replace("_", "-"), value)) # Reading the tfam file logger.info("Reading TFAM") tfam = readTFAM(args.tfile + ".tfam") # Find duplicated samples logger.info("Finding duplicated samples") uniqueSamples, duplicatedSamples = findDuplicates(tfam) # Prints the unique tfam logger.info("Creating TFAM for unique samples") printUniqueTFAM(tfam, uniqueSamples, args.out) # Process the TPED file logger.info("Reading TPED (and creating TPED for unique samples)") tped, tpedSamples = processTPED(uniqueSamples, duplicatedSamples, args.tfile + ".tped", args.out) if len(duplicatedSamples) == 0: logger.info("There are no duplicates in {}.tfam".format(args.tfile)) # There are no duplicated samples try: shutil.copy(args.out + ".unique_samples.tfam", args.out + ".final.tfam") except IOError: msg = "%s.unique_samples.tfam: can't copy to " \ "%s.final.tfam" % (args.out, args.out) raise ProgramError(msg) try: shutil.copy(args.out + ".unique_samples.tped", args.out + ".final.tped") except IOError: msg = "%s.unique_samples.tped: can't copy to " \ "%s.final.tped" % (args.out, args.out) raise ProgramError(msg) else: # We continue # Compute statistics logger.info("Computing the completion and concordance of duplicated " "samples") completion, concordance = computeStatistics(tped, tfam, tpedSamples, duplicatedSamples, args.out) # Print the statistics logger.info("Printing the statistics") completion_percentage = printStatistics(completion, concordance, tpedSamples, duplicatedSamples, args.out) # Print the concordance file logger.info("Printing the concordance file") concordance_percentage = printConcordance(concordance, args.out) # Print the duplicated TFAM and TPED logger.info("Creating TPED and TFAM for duplicated samples") printDuplicatedTPEDandTFAM(tped, tfam, tpedSamples, duplicatedSamples, args.out) # Choose the best duplicates logger.info("Choosing the best duplicates") chosenSamples, comp, conc = chooseBestDuplicates( tped, tpedSamples, duplicatedSamples, completion_percentage, concordance_percentage, args.out, ) # Clean the genotype of the chosen samples logger.info("Cleaning and creating unique TPED and TFAM from " "duplicated samples") newTPED, newTFAM = createAndCleanTPED( tped, tfam, tpedSamples, duplicatedSamples, chosenSamples, args.out, comp, args.sample_completion_threshold, conc, args.sample_concordance_threshold, ) # Add the chosen TPED and TFAM logger.info("Creating final TPED and TFAM file") addToTPEDandTFAM(newTPED, newTFAM, args.out, args.out + ".unique_samples")
[docs]def addToTPEDandTFAM(tped, tfam, prefix, toAddPrefix): """Append a tfile to another, creating a new one. :param tped: the ``tped`` that will be appended to the other one. :param tfam: the ``tfam`` that will be appended to the other one. :param prefix: the prefix of all the files. :param toAddPrefix: the prefix of the final file. :type tped: :py:class:`numpy.array` :type tfam: :py:class:`numpy.array` :type prefix: str :type toAddPrefix: str Here are the steps of this function: 1. Writes the ``tped`` into ``prefix.chosen_samples.tped``. 2. Writes the ``tfam`` into ``prefix.chosen_samples.tfam``. 3. Copies the previous ``tfam`` (``toAddPrefix.tfam``) into the final ``tfam`` (``prefix.final.tfam``). 4. Append the ``tfam`` to the final ``tfam`` (``prefix.final.tfam``). 5. Reads the previous ``tped`` (``toAddPrefix.tped``) and append the new ``tped`` to it, writing the final one (``prefix.final.tped``). .. warning:: The ``tped`` and ``tfam`` variables need to contain at least one sample. """ # First, writing the chosen tped tpedFile = None try: tpedFile = open(prefix + ".chosen_samples.tped", "w") except IOError: msg = "%(prefix)s.chosen_samples.tped: can't write file" % locals() raise ProgramError(msg) for row in tped: print >>tpedFile, "\t".join(row) tpedFile.close() # Second, writing the chosen tfam tfamFile = None try: tfamFile = open(prefix + ".chosen_samples.tfam", "w") except IOError: msg = "%(prefix)s.chosen_samples.tfam: can't write file" % locals() raise ProgramError(msg) for row in tfam: print >>tfamFile, "\t".join(row) tfamFile.close() # The final TFAM file (copying and appending) try: shutil.copyfile(toAddPrefix + ".tfam", prefix + ".final.tfam") except IOError: msg = "%(toAddPrefix)s.tfam: can't copy to " \ "%(prefix)s.final.tfam" % locals() raise ProgramError(msg) newTFAM = None try: newTFAM = open(prefix + ".final.tfam", "a") except IOError: msg = "%(prefix)s.final.tfam: can't append" % locals() raise ProgramError(msg) for row in tfam: print >>newTFAM, "\t".join(row) newTFAM.close() # The final TPED file newTPED = None try: newTPED = open(prefix + ".final.tped", "w") except IOError: msg = "%(prefix)s.final.tped: can't write file" % locals() raise ProgramError(msg) try: with open(toAddPrefix + ".tped", "r") as inputFile: for i, line in enumerate(inputFile): row = line.rstrip("\r\n").split("\t") originalSNP = row[1] toAdd = tped[i] if originalSNP != toAdd[1]: msg = "%(toAddPrefix)s.tped: not good sort" % locals() raise ProgramError(msg) row.extend(list(toAdd[4:])) print >>newTPED, "\t".join(row) except IOError: msg = "%(toAddPrefix)s.tped: no such file" % locals() raise ProgramError(msg) newTPED.close()
[docs]def createAndCleanTPED(tped, tfam, samples, oldSamples, chosenSamples, prefix, completion, completionT, concordance, concordanceT): """Complete a TPED for duplicate samples. :param tped: the ``tped`` containing the duplicated samples. :param tfam: the ``tfam`` containing the duplicated samples. :param samples: the updated position of the samples in the ``tped`` containing only duplicated samples. :param oldSamples: the original duplicated sample positions. :param chosenSamples: the position of the chosen samples. :param prefix: the prefix of all the files. :param completion: the completion of each of the duplicated samples. :param completionT: the completion threshold. :param concordance: the pairwise concordance of each of the duplicated samples. :param concordanceT: the concordance threshold. :type tped: :py:class:`numpy.array` :type tfam: :py:class:`numpy.array` :type samples: dict :type oldSamples: dict :type chosenSamples: dict :type prefix: str :type completion: :py:class:`numpy.array` :type completionT: float :type concordance: dict :type concordanceT: float Using a ``tped`` containing duplicated samples, it creates a ``tped`` containing unique samples by completing a chosen sample with the other replicates. .. note:: A chosen sample is not completed using bad replicates (those that don't have a concordance or a completion higher than a certain threshold). The bad replicates are written in the file ``prefix.not_good_enough``. """ zeroedOutFile = None try: zeroedOutFile = open(prefix + ".zeroed_out", "w") except IOError: msg = "%(prefix)s.zeroed_out: can't write file" % locals() raise ProgramError(msg) print >>zeroedOutFile, "\t".join(["famID", "indID", "snpID"]) notGoodEnoughFile = None try: notGoodEnoughFile = open(prefix + ".not_good_enough", "w") except IOError: msg = "%(prefix)s.not_good_enough: can't write file" % locals() raise ProgramError(msg) print >>notGoodEnoughFile, "\t".join(["origIndex", "dupIndex", "famID", "indID", "reason"]) notGoodEnoughSamples = defaultdict(set) # Split the tped in 'snpInfo' and 'genotypes' snpInfo = tped[:, :4] genotypes = tped[:, 4:] # Getting the completion and concordance indexes completionSampleID = {} concordanceSampleID = {} for sampleID, indexes in samples.iteritems(): # Checking the completion completionToKeep = np.where(completion[indexes] >= completionT)[0] completionToRemove = np.where(completion[indexes] < completionT)[0] for k in completionToRemove: notGoodEnoughSamples[(str(oldSamples[sampleID][k]+1), str(indexes[k]+1), sampleID[0], sampleID[1])].add("completion") # Checking the concordance concordanceToKeep = np.where(concordance[sampleID] >= concordanceT)[0] concordanceToRemove = np.where( concordance[sampleID] < concordanceT )[0] for k in concordanceToRemove: notGoodEnoughSamples[(str(oldSamples[sampleID][k]+1), str(indexes[k]+1), sampleID[0], sampleID[1])].add("concordance") completionSampleID[sampleID] = completionToKeep concordanceSampleID[sampleID] = concordanceToKeep for i, curSNPgenotypes in enumerate(genotypes): for sampleID, indexes in samples.iteritems(): # Getting the concordance and the completion concordanceToKeep = concordanceSampleID[sampleID] completionToKeep = completionSampleID[sampleID] # Getting the chosen sample index chosenOne = chosenSamples[sampleID] # Getting the duplicates' genotypes curGenotypes = curSNPgenotypes[indexes] # Subsetting for the good completion and concordance curGenotypes = curGenotypes[np.intersect1d(concordanceToKeep, completionToKeep)] # Removing the no call from the genotypes cleanedCurGenotype = curGenotypes[np.where(curGenotypes != "0 0")] # Checking the number of unique genotypes uniqueGenotypes = np.unique(cleanedCurGenotype) # Checking the number of unique genotypes (except 0 0) if len(uniqueGenotypes) > 1: # There are more than one unique genotypes (except 0 0) # len = 0 means all were 0 0 # len = 1 means all the same # len > 1 means more than one unique genotypes possibleAlleles = [ set() for k in xrange(len(uniqueGenotypes)) ] for k, geno in enumerate(uniqueGenotypes): possibleAlleles[k] |= set(geno.split(" ")) allEqual = True for k in xrange(len(possibleAlleles)): for l in xrange(k+1, len(possibleAlleles)): if possibleAlleles[k] != possibleAlleles[l]: allEqual = False break if not allEqual: break if not allEqual: # Changing the chosen sample's genotype to no call tped[i][chosenOne+4] = "0 0" print >>zeroedOutFile, "\t".join([sampleID[0], sampleID[1], snpInfo[i][1]]) else: # Do we want to complete the chosen sample's genotype??? pass # Writing the not good enough file for key, value in notGoodEnoughSamples.iteritems(): print >>notGoodEnoughFile, "\t".join(list(key) + [";".join(value)]) # Closing the output files zeroedOutFile.close() notGoodEnoughFile.close() # Getting the indexes to keep sampleToKeep = np.array(chosenSamples.values()) + 4 sampleToKeep.sort() toKeep = np.append(np.array(range(4)), sampleToKeep) # Subsetting the tped newTPED = tped[:, toKeep] # Getting the indexes of the tfam toKeep = [] for sampleID, indexes in oldSamples.iteritems(): i = samples[sampleID].index(chosenSamples[sampleID]) toKeep.append(indexes[i]) toKeep = np.array(toKeep) toKeep.sort() # Subsetting the tfam newTFAM = tfam[toKeep] return newTPED, newTFAM
[docs]def chooseBestDuplicates(tped, samples, oldSamples, completion, concordance_all, prefix): """Choose the best duplicates according to the completion rate. :param tped: the ``tped`` containing the duplicated samples. :param samples: the updated position of the samples in the tped containing only duplicated samples. :param oldSamples: the original duplicated sample positions. :param completion: the completion of each of the duplicated samples. :param concordance_all: the concordance of every duplicated samples. :param prefix: the prefix of all the files. :type tped: :py:class:`numpy.array` :type samples: dict :type oldSamples: dict :type completion: :py:class:`numpy.array` :type concordance_all: dict :type prefix: str :returns: a tuple where the first element is a list of the chosen samples' indexes, the second on is the completion and the last one is the concordance (a map). These are the steps to find the best duplicated sample: 1. Sort the list of concordances. 2. Sort the list of completions. 3. Choose the best of the concordance and put in a set. 4. Choose the best of the completion and put it in a set. 5. Compute the intersection of the two sets. If there is one sample or more, then randomly choose one sample. 6. If the intersection doesn't contain at least one sample, redo steps 3 and 4, but increase the number of chosen best by one. Redo step 5 and 6 (if required). The chosen samples are written in ``prefix.chosen_samples.info``. The rest are written in ``prefix.excluded_samples.info``. """ # The output files chosenFile = None try: chosenFile = open(prefix + ".chosen_samples.info", "w") except IOError: msg = "%(prefix)s.chosen_samples.info: can't write file" % locals() raise ProgramError(msg) print >>chosenFile, "\t".join(["origIndex", "dupIndex", "famID", "indID"]) excludedFile = None try: excludedFile = open(prefix + ".excluded_samples.info", "w") except IOError: msg = "%(prefix)s.excluded_samples.info: can't write file" % locals() raise ProgramError(msg) print >>excludedFile, "\t".join(["origIndex", "dupIndex", "famID", "indID"]) # For each duplicated sample chosenIndexes = {} sampleConcordance = {} for sample, indexes in samples.iteritems(): # Getting the completion for those duplicated samples currCompletion = completion[indexes] # Sorting those completion sortedCompletionIndexes = np.argsort(currCompletion) # Getting the concordance concordance = concordance_all[sample] currConcordance = [[] for i in xrange(len(indexes))] for i in xrange(len(indexes)): indexToKeep = list(set(range(len(indexes))) - set([i])) currConcordance[i] = np.mean(concordance[i, indexToKeep]) currConcordance = np.array(currConcordance) if sample not in sampleConcordance: sampleConcordance[sample] = currConcordance # Sorting the concordance sortedConcordanceIndexes = np.argsort(currConcordance) # Trying to find the best duplicate to keep nbToCheck = 1 chosenIndex = None while nbToCheck <= len(indexes): # Getting the `nbToCheck` best value (higher to lower) completionValue = currCompletion[ sortedCompletionIndexes[nbToCheck*-1] ] concordanceValue = currConcordance[ sortedConcordanceIndexes[nbToCheck*-1] ] # Getting the indexes to consider completionToConsider = set( np.where(currCompletion >= completionValue)[0] ) concordanceToConsider = set( np.where(currConcordance >= concordanceValue)[0]) # Getting the intersection of the indexes toConsider = concordanceToConsider & completionToConsider if len(toConsider) >= 1: chosenIndex = random.choice(list(toConsider)) break nbToCheck += 1 if chosenIndex is None: msg = "Could not choose the best sample ID for {}".format(sample) raise ProgramError(msg) # Printing the chosen samples print >>chosenFile, "\t".join([str(oldSamples[sample][chosenIndex]+1), str(indexes[chosenIndex]+1), sample[0], sample[1]]) # Printing the excluded samples for i, index in enumerate(indexes): if i != chosenIndex: print >>excludedFile, "\t".join([str(oldSamples[sample][i]+1), str(index+1), sample[0], sample[1]]) chosenIndexes[sample] = indexes[chosenIndex] # Closing the output files chosenFile.close() excludedFile.close() return chosenIndexes, completion, sampleConcordance
[docs]def printDuplicatedTPEDandTFAM(tped, tfam, samples, oldSamples, prefix): """Print the TPED and TFAM of the duplicated samples. :param tped: the ``tped`` containing duplicated samples. :param tfam: the ``tfam`` containing duplicated samples. :param samples: the updated position of the samples in the tped containing only duplicated samples. :param oldSamples: the original duplicated sample positions. :param prefix: the prefix of all the files. :type tped: :py:class:`numpy.array` :type tfam: :py:class:`numpy.array` :type samples: dict :type oldSamples: dict :type prefix: str The ``tped`` and ``tfam`` files are written in ``prefix.duplicated_samples.tped`` and ``prefix.duplicated_samples.tfam``, respectively. """ # Print the TPED outputTPED = None try: outputTPED = open(prefix + ".duplicated_samples.tped", "w") except IOError: msg = "%(prefix)s.duplicated_samples.tped: can't write " \ "file" % locals() raise ProgramError(msg) for row in tped: print >>outputTPED, "\t".join(row) outputTPED.close() # Print the TFAM nbSamples = len(tped[0][4:]) newTFAM = [0 for i in xrange(nbSamples)] for samples, indexes in samples.iteritems(): oldIndexes = oldSamples[samples] for i, index in enumerate(indexes): oldIndex = oldIndexes[i] newTFAM[index] = tfam[oldIndex] outputTFAM = None try: outputTFAM = open(prefix + ".duplicated_samples.tfam", "w") except IOError: msg = "%(prefix)s.duplicated_samples.tfam: can't write " \ "file" % locals() raise ProgramError(msg) for row in newTFAM: print >>outputTFAM, "\t".join(row) outputTFAM.close()
[docs]def printConcordance(concordance, prefix): """Print the concordance. :param concordance: the concordance of each sample. :param prefix: the prefix of all the files. :type concordance: dict :type prefix: str :returns: the concordance percentage (dict) The concordance is the number of genotypes that are equal when comparing a duplicated samples with another one, divided by the total number of genotypes (excluding genotypes that are no call [*i.e.* ``0``]). If a duplicated sample has 100% of no calls, the concordance will be zero. The file ``prefix.concordance`` will contain :math:`N \\times N` matrices for each set of duplicated samples. """ outFile = None try: outFile = open(prefix + ".concordance", "w") except IOError: msg = "%s: can't write file" % prefix + ".concordance" raise ProgramError(msg) concordance_percentage = {} for key in concordance.iterkeys(): print >>outFile, "#%s\t%s" % key # Doing the division none_zero = concordance[key][1] != 0 true_concordance = np.zeros(np.multiply(*concordance[key][1].shape)) true_concordance[np.ravel(none_zero)] = np.true_divide( concordance[key][0][none_zero], concordance[key][1][none_zero], ) true_concordance.shape = concordance[key][1].shape true_concordance = np.asmatrix(true_concordance) concordance_percentage[key] = true_concordance output = StringIO.StringIO() np.savetxt(output, true_concordance, delimiter="\t", fmt="%.8f") print >>outFile, output.getvalue().rstrip("\r\n") outFile.close() return concordance_percentage
[docs]def printStatistics(completion, concordance, tpedSamples, oldSamples, prefix): """Print the statistics in a file. :param completion: the completion of each duplicated samples. :param concordance: the concordance of each duplicated samples. :param tpedSamples: the updated position of the samples in the tped containing only duplicated samples. :param oldSamples: the original duplicated sample positions. :param prefix: the prefix of all the files. :type completion: :py:class:`numpy.array` :type concordance: dict :type tpedSamples: dict :type oldSamples: dict :type prefix: str :returns: the completion for each duplicated samples, as a :py:class:`numpy.array`. Prints the statistics (completion of each samples and pairwise concordance between duplicated samples) in a file (``prefix.summary``). """ # Compute the completion percentage on none zero values none_zero_indexes = np.where(completion[1] != 0) completionPercentage = np.zeros(len(completion[0]), dtype=float) completionPercentage[none_zero_indexes] = np.true_divide( completion[0, none_zero_indexes], completion[1, none_zero_indexes], ) # The output file containing the summary statistics (for each of the # duplicated samples, print the mean concordance and the completion). outputFile = None try: outputFile = open(prefix + ".summary", "w") except IOError: msg = "%(prefix)s.summary: can't write file" % locals() raise ProgramError(msg) print >>outputFile, "\t".join(["origIndex", "dupIndex", "famID", "indID", "% completion", "completion", "mean concordance"]) for sampleID, indexes in tpedSamples.iteritems(): for i, index in enumerate(indexes): # The indexes toPrint = [str(oldSamples[sampleID][i]+1), str(index+1)] # The samples toPrint.extend(list(sampleID)) # The completion toPrint.append("%.8f" % completionPercentage[index]) toPrint.append("%d/%d" % (completion[0][index], completion[1][index])) # The concordance (not on total values = 0) indexToKeep = list(set(range(len(indexes))) - set([i])) values = np.ravel( np.asarray(concordance[sampleID][0][i, indexToKeep]) ) total_values = np.ravel( np.asarray(concordance[sampleID][1][i, indexToKeep]) ) currConcordance = np.zeros(len(indexToKeep), dtype=float) none_zero_indexes = np.where(total_values != 0) currConcordance[none_zero_indexes] = np.true_divide( values[none_zero_indexes], total_values[none_zero_indexes], ) currConcordance = np.mean(currConcordance) toPrint.append("%.8f" % currConcordance) print >>outputFile, "\t".join(toPrint) # Closing the output file outputFile.close() return completionPercentage
[docs]def computeStatistics(tped, tfam, samples, oldSamples, prefix): """Computes the completion and concordance of each samples. :param tped: the ``tped`` containing duplicated samples. :param tfam: the ``tfam`` containing duplicated samples. :param samples: the updated position of the samples in the tped containing only duplicated samples. :param oldSamples: the original duplicated sample positions. :param prefix: the prefix of all the files. :type tped: :py:class:`numpy.array` :type tfam: :py:class:`numpy.array` :type samples: dict :type oldSamples: dict :type prefix: str :returns: a tuple containing the completion (:py:class:`numpy.array`) as first element, and the concordance (:py:class:`dict`) as last element. Reads the ``tped`` file and compute the completion for each duplicated samples and the pairwise concordance between duplicated samples. .. note:: The completion and concordance computation excludes a markers if it's on chromosome 24 and if the sample is a female. .. note:: A missing genotype is encoded by ``0``. .. note:: No percentage is computed here, only the numbers. Percentages are computing in other functions: :py:func:`printStatistics`, for completion, and :py:func:`printConcordance`, for concordance. **Completion** Computes the completion of none zero values (where all genotypes of at least one duplicated sample are no call [*i.e.* ``0``]). The completion of sample :math:`i` (*i.e.* :math:`Comp_i`) is the number of genotypes that have a call divided by the total number of genotypes (the set :math:`G_i`): .. math:: Comp_i = \\frac{||g \\in G_i\\textrm{ where }g \\neq 0||}{||G_i||} .. note:: We consider a genotype as being missing if the sample is a male and if a marker on chromosome 23 or 24 is heterozygous. **Concordance** Computes the pairwise concordance between duplicated samples. For each marker, if both genotypes are not missing, we add one to the total number of compared markers. If both genotypes are the same, we add one to the number of concordant calls. We write the observed genotype difference in the file ``prefix.diff``. The concordance between sample :math:`i` and :math:`j` (*i.e.* :math:`Concordance_{i,j}`) is the number of genotypes that are equal divided by the total number of genotypes (excluding the no calls): .. math:: Concordance_{i,j} = \\frac{ ||g \\in G_i \\cup G_j \\textrm{ where } g_i = g_j \\neq 0|| }{ ||g \\in G_i \\cup G_j \\textrm{ where } g \\neq 0|| } """ # The diff file containing the genotype differences between a pair of # duplicated samples diffOutput = None try: diffOutput = open(prefix + ".diff", "w") except IOError: msg = "%(prefix)s.diff: can't write file" % locals() raise ProgramError(msg) print >>diffOutput, "\t".join(["name", "famID", "indID", "dupIndex_1", "dupIndex_2", "genotype_1", "genotype_2"]) # The completion data type completion = np.array([[0 for i in xrange(len(tped[0][4:]))], [0 for i in xrange(len(tped[0][4:]))]]) # The concordance data type concordance = {} for sampleID in samples.keys(): nbDup = len(samples[sampleID]) concordance[sampleID] = [ np.asmatrix(np.zeros((nbDup, nbDup), dtype=int)), np.asmatrix(np.zeros((nbDup, nbDup), dtype=int)), ] ##################################################################### # Add options for concordance only for hetero SNPs... (all samples) # ##################################################################### for row in tped: genotype = row[4:] chromosome = row[0] snpName = row[1] for key, indexes in samples.iteritems(): for i, index in enumerate(indexes): sex = tfam[oldSamples[key][i]][4] genotype1 = set(genotype[index].split(" ")) # Updating the completion if not ((chromosome == "24") and (sex == "2")): if (sex == "1") and (chromosome in ["23", "24"]): if ("0" not in genotype1) and (len(genotype1) != 2): completion[0][index] += 1 elif "0" not in genotype1: completion[0][index] += 1 completion[1][index] += 1 # Updating the concordance for j in xrange(i + 1, len(indexes)): genotype2 = set(genotype[indexes[j]].split(" ")) # This is for man on chr 23 and 24, if heterozygous!!! # # if (sex == "1") and (chromosome in ["23", "24"]): # if ("0" not in genotype1) and ("0" not in genotype2): # if (len(genotype1) != 2) and (len(genotype2) != 2): # concordance[key][1][i,j] += 1 # concordance[key][1][j,i] += 1 # if genotype1 == genotype2: # concordance[key][0][i,j] += 1 # concordance[key][0][j,i] += 1 # else: # # Print to diff file # toPrint = [snpName, key[0], key[1], # str(index+1), # str(indexes[j]+1)] # toPrint.append(" ".join(genotype1)) # if len(genotype1) == 1: # toPrint[-1] += " %s" % toPrint[-1] # toPrint.append(" ".join(genotype2)) # if len(genotype2) == 1: # toPrint[-1] += " %s" % toPrint[-1] # print >>diffOutput, "\t".join(toPrint) # # elif ("0" not in genotype1) and ("0" not in genotype2): if ("0" not in genotype1) and ("0" not in genotype2): # Both have calls concordance[key][1][i, j] += 1 concordance[key][1][j, i] += 1 if genotype1 == genotype2: concordance[key][0][i, j] += 1 concordance[key][0][j, i] += 1 else: # Print to diff file toPrint = [snpName, key[0], key[1], str(index+1), str(indexes[j]+1)] toPrint.append(" ".join(genotype1)) if len(genotype1) == 1: toPrint[-1] += " %s" % toPrint[-1] toPrint.append(" ".join(genotype2)) if len(genotype2) == 1: toPrint[-1] += " %s" % toPrint[-1] print >>diffOutput, "\t".join(toPrint) diffOutput.close() for key in concordance.iterkeys(): for i in range(len(concordance[key][0])): concordance[key][0][i, i] = 1 concordance[key][1][i, i] = 1 return completion, concordance
[docs]def processTPED(uniqueSamples, duplicatedSamples, fileName, prefix): """Process the TPED file. :param uniqueSamples: the position of unique samples. :param duplicatedSamples: the position of duplicated samples. :param fileName: the name of the file. :param prefix: the prefix of all the files. :type uniqueSamples: dict :type duplicatedSamples: collections.defaultdict :type fileName: str :type prefix: str :returns: a tuple containing the ``tped`` (:py:class:`numpy.array`) as first element, and the updated positions of the duplicated samples (:py:class:`dict`) Reads the entire ``tped`` and prints another one containing only unique samples (``prefix.unique_samples.tped``). It then creates a :py:class:`numpy.array` containing the duplicated samples. """ # Getting the indexes uI = sorted(uniqueSamples.values()) dI = [] for item in duplicatedSamples.itervalues(): dI.extend(item) dI.sort() tped = [] outputFile = None try: outputFile = open(prefix + ".unique_samples.tped", "w") except IOError: msg = "%s: can't write to file" % prefix + ".unique_samples.tped" raise ProgramError with open(fileName, 'r') as inputFile: for line in inputFile: row = line.rstrip("\r\n").split("\t") snpInfo = row[:4] genotype = row[4:] # Printing the new TPED file (unique samples only) print >>outputFile, "\t".join(snpInfo + [genotype[i] for i in uI]) # Saving the TPED file (duplicated samples only) tped.append(tuple(snpInfo + [genotype[i] for i in dI])) # Closing the output file outputFile.close() # Updating the positions updatedSamples = {} for sampleID in duplicatedSamples: positions = duplicatedSamples[sampleID] newPositions = range(len(positions)) for i, pos in enumerate(positions): newPositions[i] = dI.index(pos) updatedSamples[sampleID] = newPositions tped = np.array(tped) return tped, updatedSamples
[docs]def printUniqueTFAM(tfam, samples, prefix): """Prints a new TFAM with only unique samples. :param tfam: a representation of a TFAM file. :param samples: the position of the samples :param prefix: the prefix of the output file name :type tfam: list :type samples: dict :type prefix: str """ fileName = prefix + ".unique_samples.tfam" try: with open(fileName, "w") as outputFile: for i in sorted(samples.values()): print >>outputFile, "\t".join(tfam[i]) except IOError: msg = "%(fileName)s: no such file" raise ProgramError(msg)
[docs]def findDuplicates(tfam): """Finds the duplicates in a TFAM. :param tfam: representation of a ``tfam`` file. :type tfam: list :returns: two :py:class:`dict`, containing unique and duplicated samples position. """ uSamples = {} dSamples = defaultdict(list) for i, row in enumerate(tfam): sampleID = tuple(row[:2]) if sampleID not in uSamples: # This is the first time we see this sample uSamples[sampleID] = i else: # We have seen this sample at least once... if sampleID not in dSamples: # This is the second time we see this sample... dSamples[sampleID].extend([uSamples[sampleID], i]) else: # We have seen this sample multiple times dSamples[sampleID].append(i) # Removing the duplicates from the unique samples for sampleID in dSamples.iterkeys(): if sampleID in uSamples: del uSamples[sampleID] return uSamples, dSamples
[docs]def readTFAM(fileName): """Reads the TFAM file. :param fileName: the name of the ``tfam`` file. :type fileName: str :returns: a list of tuples, representing the ``tfam`` file. """ # Saving the TFAM file tfam = None with open(fileName, 'r') as inputFile: tfam = [ tuple(i.rstrip("\r\n").split("\t")) for i in inputFile.readlines() ] tfam = np.array(tfam) return tfam
[docs]def checkArgs(args): """Checks the arguments and options. :param args: a :py:class:`argparse.Namespace` object containing the options of the program. :type args: :py:class:`argparse.Namespace` :returns: ``True`` if everything was OK. If there is a problem with an option, an exception is raised using the :py:class:`ProgramError` class, a message is printed to the :class:`sys.stderr` and the program exists with code 1. """ # Checking the input files for suffix in [".tped", ".tfam"]: fileName = args.tfile + suffix if not os.path.isfile(fileName): msg = "%(fileName)s: no such file" % locals() raise ProgramError(msg) # Checking the concordance threshold if ((args.sample_concordance_threshold < 0) or (args.sample_concordance_threshold > 1)): msg = "sample-concordance-threshold: must be between 0 and 1 " \ "(not %f)" % args.sample_concordance_threshold raise ProgramError(msg) # Checking the completion threshold if ((args.sample_completion_threshold < 0) or (args.sample_completion_threshold > 1)): msg = "sample-completion-threshold: must be between 0 and 1 " \ "(not %f)" % args.sample_completion_threshold raise ProgramError(msg) return True
[docs]def parseArgs(argString=None): # pragma: no cover """Parses the command line options and arguments. :param argString: the options :type argString: list :returns: A :py:class:`argparse.Namespace` object created by the :py:mod:`argparse` module. It contains the values of the different options. ================================== ====== ================================= Options Type Description ================================== ====== ================================= ``--tfile`` string The input file prefix (of type ``tfile``). ``--sample-completion-threshold`` float The completion threshold. ``--sample-concordance-threshold`` float The concordance threshold. ``--out`` string The prefix of the output files. ================================== ====== ================================= .. note:: No option check is done here (except for the one automatically done by argparse). Those need to be done elsewhere (see :py:func:`checkArgs`). """ args = None if argString is None: args = parser.parse_args() else: args = parser.parse_args(argString) return args
[docs]class ProgramError(Exception): """An :py:class:`Exception` raised in case of a problem. :param msg: the message to print to the user before exiting. :type msg: str """ def __init__(self, msg): """Construction of the :py:class:`ProgramError` class. :param msg: the message to print to the user :type msg: str """ self.message = str(msg) def __str__(self): return self.message
# The parser object pretty_name = "Duplicated samples" desc = "Extracts and merges duplicated samples." long_desc = ("The script evaluates concordance and completion rate. If the " "thresholds are met, the script merges and completes the " "genotypes.") parser = argparse.ArgumentParser(description=desc) parser.add_argument("-v", "--version", action="version", version="pyGenClean version {}".format(__version__)) # The INPUT files group = parser.add_argument_group("Input File") group.add_argument("--tfile", type=str, metavar="FILE", required=True, help=("The input file prefix (will find the tped and tfam " "file by appending the prefix to .tped and .tfam, " "respectively.) The duplicated samples should have " "the same identification numbers (both family and " "individual ids.)")) # The options group = parser.add_argument_group("Options") group.add_argument("--sample-completion-threshold", type=float, metavar="FLOAT", default=0.9, help=("The completion threshold to consider a replicate " "when choosing the best replicates and for creating " "the composite samples. [default: %(default).1f]")) group.add_argument("--sample-concordance-threshold", type=float, metavar="FLOAT", default=0.97, help=("The concordance threshold to consider a replicate " "when choosing the best replicates and for creating " "the composite samples. [default: %(default).2f]")) # The OUTPUT files group = parser.add_argument_group("Output File") group.add_argument("--out", type=str, metavar="FILE", default="dup_samples", help=("The prefix of the output files. [default: " "%(default)s]"))
[docs]def safe_main(): """A safe version of the main function (that catches ProgramError).""" try: main() except KeyboardInterrupt: logger.info("Cancelled by user") sys.exit(0) except ProgramError as e: logger.error(e.message) parser.error(e.message)
if __name__ == "__main__": safe_main()