#!/usr/bin/env python2.7
# This file is part of pyGenClean.
#
# pyGenClean is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation, either version 3 of the License, or (at your option) any later
# version.
#
# pyGenClean is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
# A PARTICULAR PURPOSE. See the GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along with
# pyGenClean. If not, see <http://www.gnu.org/licenses/>.
import os
import sys
import math
import shutil
import random
import logging
import StringIO
import argparse
import subprocess
from collections import defaultdict
import numpy as np
from .. import __version__
from ..PlinkUtils import createRowFromPlinkSpacedOutput
logger = logging.getLogger("duplicated_snps")
def main(argString=None):
    """The main function of the module.

    :param argString: the options and arguments to parse (when ``None``, the
                      arguments are taken from ``sys.argv``).
    :type argString: list

    Here are the steps for duplicated samples:

    1. Prints the options.
    2. Reads the ``map`` file to gather marker's position
       (:py:func:`readMAP`).
    3. Reads the ``tfam`` file (:py:func:`readTFAM`).
    4. Finds the unique markers using the ``map`` file
       (:py:func:`findUniques`).
    5. Process the ``tped`` file. Write a file containing unique markers in
       ``prefix.unique_snps`` (``tfam`` and ``tped``). Keep in memory
       information about the duplicated markers (``tped``)
       (:py:func:`processTPED`).
    6. If there are no duplicated markers, the file ``prefix.unique_snps``
       (``tped`` and ``tfam``) are copied to ``prefix.final``.
    7. If there are duplicated markers, print a ``tped`` and ``tfam`` file
       containing the duplicated markers
       (:py:func:`printDuplicatedTPEDandTFAM`).
    8. Computes the frequency of the duplicated markers (using Plink) and read
       the output file (:py:func:`computeFrequency`).
    9. Computes the concordance and pairwise completion of each of the
       duplicated markers (:py:func:`computeStatistics`).
    10. Prints the problematic duplicated markers with a file containing the
        summary of the statistics (completion and pairwise concordance)
        (:py:func:`printProblems`).
    11. Print the pairwise concordance in a file (matrices)
        (:py:func:`printConcordance`).
    12. Choose the best duplicated markers using concordance and completion
        (:py:func:`chooseBestSnps`).
    13. Completes the chosen markers with the remaining duplicated markers
        (:py:func:`createAndCleanTPED`).
    14. Creates the final ``tped`` file, containing the unique markers, the
        chosen duplicated markers that were completed, and the problematic
        duplicated markers (for further analysis). This set excludes markers
        that were used for completing the chosen ones
        (:py:func:`createFinalTPEDandTFAM`).
    """
    # Getting and checking the options
    args = parseArgs(argString)
    checkArgs(args)

    logger.info("Options used:")
    for key, value in vars(args).iteritems():
        logger.info(" --{} {}".format(key.replace("_", "-"), value))

    # Reading the map file (marker positions, used to spot duplicates)
    logger.info("Reading MAP file")
    mapF = readMAP(args.tfile + ".map", args.out)

    # Reading the tfam file (sample information)
    logger.info("Reading TFAM file")
    tfam = readTFAM(args.tfile + ".tfam")

    # Find unique snps
    logger.info("Finding unique SNPs")
    uniqueSNPs = findUniques(mapF)

    # Process the TPED file: unique markers go to disk, duplicated markers
    # are kept in memory for the analysis below
    logger.info("Reading TPED file")
    tped, duplicatedSNPs = processTPED(uniqueSNPs, mapF,
                                       args.tfile + ".tped",
                                       args.tfile + ".tfam", args.out)

    if len(tped) == 0:
        # No duplicated marker: the "unique" files already form the final
        # data set, so they are simply copied
        logger.info("There are no duplicated SNPs")
        logger.info(" - Creating final TFAM")

        # Copying the files
        # The TFAM
        try:
            shutil.copy(args.out + ".unique_snps.tfam",
                        args.out + ".final.tfam")
        except IOError:
            msg = "%s.unique_snps.tfam: can't copy file to " \
                  "%s.final.tfam" % (args.out, args.out)
            raise ProgramError(msg)

        # The TPED
        logger.info(" - Creating final TPED")
        try:
            shutil.copy(args.out + ".unique_snps.tped",
                        args.out + ".final.tped")
        except IOError:
            msg = "%s.unique_snps.tped: can't copy file to " \
                  "%s.final.tped" % (args.out, args.out)
            raise ProgramError(msg)
    else:
        # We print the TPED and TFAM for the duplicated SNPs
        logger.info("Printing duplicated SNPs TPED and TFAM files")
        printDuplicatedTPEDandTFAM(tped, args.tfile + ".tfam", args.out)

        # Computing the frequency of the duplicated SNPs (using Plink)
        logger.info("Computing duplicated SNPs' frequencies")
        dupSNPsFreq = computeFrequency(args.out + ".duplicated_snps",
                                       args.out)

        # Compute completion and pairwise concordance statistics
        logger.info("Computing concordance and completion of duplicated SNPs")
        completion, concordance = computeStatistics(tped, tfam, duplicatedSNPs)

        # Print the statistics; markers with problems are excluded from the
        # set of markers to complete
        logger.info("Printing duplicated SNPs summary file and finding errors "
                    "within duplicates")
        snpsToComplete = printProblems(completion, concordance, tped,
                                       duplicatedSNPs, dupSNPsFreq, args.out,
                                       args.frequency_difference)

        # Print the concordance file
        logger.info("Printing concordance file")
        printConcordance(concordance, args.out, tped, duplicatedSNPs)

        # Choose the best SNP of each duplicated group
        logger.info("Choosing best SNP for each duplicates")
        chosenSNPs, comp, conc = chooseBestSnps(tped, duplicatedSNPs,
                                                completion, concordance,
                                                args.out)

        # Complete the chosen SNPs using the other duplicates
        logger.info("Completing chosen duplicates (removing discordant "
                    "genotypes)")
        newTPED, snpToRemove = createAndCleanTPED(
            tped,
            tfam,
            duplicatedSNPs,
            args.out,
            chosenSNPs,
            comp,
            conc,
            snpsToComplete,
            args.tfile + ".tfam",
            args.snp_completion_threshold,
            args.snp_concordance_threshold,
        )

        # Creates the final tped
        logger.info("Writing final TPED and TFAM file")
        createFinalTPEDandTFAM(newTPED, args.out + ".unique_snps",
                               args.out, snpToRemove)
def createFinalTPEDandTFAM(tped, toReadPrefix, prefix, snpToRemove):
    """Creates the final TPED and TFAM.

    :param tped: a representation of the ``tped`` of duplicated markers.
    :param toReadPrefix: the prefix of the unique files.
    :param prefix: the prefix of the output files.
    :param snpToRemove: the markers to remove (indexes into ``tped``).

    :type tped: numpy.array
    :type toReadPrefix: str
    :type prefix: str
    :type snpToRemove: set

    Starts by copying the unique markers' ``tfam`` file to
    ``prefix.final.tfam``. Then, it copies the unique markers' ``tped`` file,
    in which the chosen markers will be appended.

    The final data set will include the unique markers, the chosen markers
    which were completed, and the problematic duplicated markers (for further
    analysis). The markers that were used to complete the chosen ones are not
    present in the final data set.
    """
    # First, copying the tfam
    try:
        shutil.copy(toReadPrefix + ".tfam", prefix + ".final.tfam")
    except IOError:
        msg = "%(toReadPrefix)s.tfam: can't copy file to " \
              "%(prefix)s.final.tfam" % locals()
        raise ProgramError(msg)

    # Next, copy the tped, and append at the end
    try:
        shutil.copy(toReadPrefix + ".tped", prefix + ".final.tped")
    except IOError:
        # BUG FIX: the message read "can't copy fil to"
        msg = "%(toReadPrefix)s.tped: can't copy file to " \
              "%(prefix)s.final.tped" % locals()
        raise ProgramError(msg)

    tpedFile = None
    try:
        tpedFile = open(prefix + ".final.tped", "a")
    except IOError:
        msg = "%(prefix)s.final.tped: can't append to file" % locals()
        raise ProgramError(msg)

    # Only the duplicated markers that were not consumed while completing a
    # chosen one are appended to the final tped
    for i, row in enumerate(tped):
        if i not in snpToRemove:
            tpedFile.write("\t".join(row) + "\n")
    tpedFile.close()
def createAndCleanTPED(tped, tfam, snps, prefix, chosenSNPs, completion,
                       concordance, snpsToComplete, tfamFileName, completionT,
                       concordanceT):
    """Complete a TPED for duplicated SNPs.

    :param tped: a representation of the ``tped`` of duplicated markers.
    :param tfam: a representation of the ``tfam``.
    :param snps: the position of duplicated markers in the ``tped``.
    :param prefix: the prefix of the output files.
    :param chosenSNPs: the markers that were chosen for completion (including
                       problems).
    :param completion: the completion of each of the duplicated markers.
    :param concordance: the pairwise concordance of the duplicated markers.
    :param snpsToComplete: the markers that will be completed (excluding
                           problems).
    :param tfamFileName: the name of the original ``tfam`` file.
    :param completionT: the completion threshold.
    :param concordanceT: the concordance threshold.

    :type tped: numpy.array
    :type tfam: list
    :type snps: dict
    :type prefix: str
    :type chosenSNPs: dict
    :type completion: numpy.array
    :type concordance: dict
    :type snpsToComplete: set
    :type tfamFileName: str
    :type completionT: float
    :type concordanceT: float

    :returns: a tuple containing the new ``tped`` after completion
              (:py:class:`numpy.array`) as the first element, and the index of
              the markers that will need to be rid of (:py:class:`set`) as the
              last element.

    It creates three different files:

    * ``prefix.zeroed_out``: contains information about markers and samples
      where the genotyped was zeroed out.
    * ``prefix.not_good_enough``: contains information about markers that were
      not good enough to help in completing the chosen markers (because of
      concordance or completion).
    * ``prefix.removed_duplicates``: the list of markers that where used for
      completing the chosen one, hence they will be removed from the final
      data set.

    Cycling through every genotypes of every samples of every duplicated
    markers, checks if the genotypes are all the same. If the chosen one was
    not called, but the other ones were, then we complete the chosen one with
    the genotypes for the others (assuming that they are all the same). If
    there is a difference between the genotypes, it is zeroed out for the
    chosen marker.
    """
    # The file that logs which genotypes were zeroed out
    zeroedOutFile = None
    try:
        zeroedOutFile = open(prefix + ".zeroed_out", "w")
    except IOError:
        # BUG FIX: the format string was "%(prefix).zeroed_out", an invalid
        # conversion specifier that would itself raise a ValueError
        msg = "%(prefix)s.zeroed_out: can't write file" % locals()
        raise ProgramError(msg)
    print >>zeroedOutFile, "\t".join(["famID", "indID", "snpID"])

    # The file that logs duplicates whose completion/concordance was too low
    # for them to be used in the completion
    notGoodEnoughFile = None
    try:
        notGoodEnoughFile = open(prefix + ".not_good_enough", "w")
    except IOError:
        msg = "%(prefix)s.not_good_enough: can't write file" % locals()
        raise ProgramError(msg)
    print >>notGoodEnoughFile, "\t".join(["name", "reason"])

    # The file that lists the duplicates consumed by the completion (they are
    # excluded from the final data set)
    removedFile = None
    try:
        removedFile = open(prefix + ".removed_duplicates", "w")
    except IOError:
        msg = "%(prefix)s.removed_duplicates: can't write file" % locals()
        raise ProgramError(msg)

    notGoodEnoughSnps = set()

    # Split the tped in 'snpInfo' (chrom, name, gen. dist., position) and
    # 'genotypes' (one column per sample)
    snpInfo = tped[:, :4]
    genotypes = tped[:, 4:]

    # The set of index we want to get rid of at the end
    getRidOfIndex = set()

    for snpID, indexes in snps.iteritems():
        if snpID not in snpsToComplete:
            # We don't want to complete this SNP, so we continue to next SNP
            continue

        # Duplicates below the completion threshold
        completionToRemove = set(
            np.where(completion[indexes] < completionT)[0]
        )
        for k in completionToRemove:
            notGoodEnoughSnps.add((snpInfo[indexes][k, 1], "completion"))

        # Duplicates below the concordance threshold
        concordanceToRemove = set(
            np.where(concordance[snpID] < concordanceT)[0]
        )
        for k in concordanceToRemove:
            notGoodEnoughSnps.add((snpInfo[indexes][k, 1], "concordance"))

        # These will be the indexes to remove
        indexesToRemove = set()
        for index in completionToRemove | concordanceToRemove:
            indexesToRemove.add(indexes[index])

        # These are the indexes to keep
        indexesToKeep = []
        for index in indexes:
            if index not in indexesToRemove:
                indexesToKeep.append(index)

        # Getting the chosen SNP
        chosenOne = chosenSNPs[snpID]
        if chosenOne not in set(indexesToKeep):
            # The chosen SNP is not a good SNP, so we go to next SNP
            logger.warning(" - {} chosen but not good enough".format(
                snpInfo[chosenOne, 1],
            ))
            continue

        # Now cycling through the genotypes
        nbSamples = genotypes.shape[1]
        for sampleIndex in xrange(nbSamples):
            # We need to remove the no call and keep the unique genotypes
            curGenotypes = genotypes[indexesToKeep, sampleIndex]
            cleanedCurGenotypes = curGenotypes[
                np.where(curGenotypes != "0 0")
            ]
            uniqueCleanedCurGenotypes = np.unique(cleanedCurGenotypes)

            # Checking the number of unique genotypes
            toComplete = False
            if len(uniqueCleanedCurGenotypes) > 1:
                # There are more than one unique genotype (except 0 0)
                #   len = 0 means all were 0 0
                #   len = 1 means they are all the same
                #   len > 1 means discordance (might need to flip)
                # Just need to check the order of the alleles
                possibleAlleles = [
                    set() for k in xrange(len(uniqueCleanedCurGenotypes))
                ]
                for k, geno in enumerate(uniqueCleanedCurGenotypes):
                    possibleAlleles[k] |= set(geno.split(" "))

                allEqual = True
                for k in xrange(len(possibleAlleles)):
                    for l in xrange(k+1, len(possibleAlleles)):
                        if possibleAlleles[k] != possibleAlleles[l]:
                            allEqual = False

                if not allEqual:
                    # The genotypes are not all equal, we set the chosen
                    # genotype to null (0 0)
                    tped[chosenOne, sampleIndex+4] = "0 0"
                    print >>zeroedOutFile, "\t".join([tfam[sampleIndex, 0],
                                                      tfam[sampleIndex, 1],
                                                      snpInfo[chosenOne, 1]])
                elif genotypes[chosenOne, sampleIndex] == "0 0":
                    # Alleles are equivalent, and the chosen one is not
                    # called: complete it
                    toComplete = True
            elif ((len(uniqueCleanedCurGenotypes) == 1) and
                    (genotypes[chosenOne, sampleIndex] == "0 0")):
                toComplete = True

            if toComplete:
                # We complete the current individual
                tped[chosenOne, sampleIndex+4] = uniqueCleanedCurGenotypes[0]

        # We keep only the chosen one
        for index in indexes:
            if index != chosenOne:
                getRidOfIndex.add(index)
                print >>removedFile, snpInfo[index, 1]

    # Writing the not good enough file
    for item in notGoodEnoughSnps:
        print >>notGoodEnoughFile, "\t".join(item)

    # Closing the output files (BUG FIX: the removed-duplicates file was
    # previously never closed, so its content could stay unflushed)
    zeroedOutFile.close()
    notGoodEnoughFile.close()
    removedFile.close()

    # Printing the chosen file
    try:
        shutil.copy(tfamFileName, prefix + ".chosen_snps.tfam")
    except IOError:
        msg = "%(tfamFileName)s: can't copy file to " \
              "%(prefix)s.chosen_snps.tfam" % locals()
        raise ProgramError(msg)

    chosenFile = None
    try:
        chosenFile = open(prefix + ".chosen_snps.tped", "w")
    except IOError:
        msg = "%(prefix)s.chosen_snps.tped: can't write file" % locals()
        raise ProgramError(msg)
    for chosenOne in chosenSNPs.itervalues():
        snpID = (tped[chosenOne, 0], tped[chosenOne, 3])
        if snpID in snpsToComplete:
            print >>chosenFile, "\t".join(tped[chosenOne])
    chosenFile.close()

    return tped, getRidOfIndex
def chooseBestSnps(tped, snps, trueCompletion, trueConcordance, prefix):
    """Choose the best duplicates according to the completion and concordance.

    :param tped: a representation of the ``tped`` of duplicated markers.
    :param snps: the position of the duplicated markers in the ``tped``.
    :param trueCompletion: the completion of each markers (numerators and
                           denominators).
    :param trueConcordance: the pairwise concordance of each markers
                            (numerators and denominators).
    :param prefix: the prefix of the output files.

    :type tped: numpy.array
    :type snps: dict
    :type trueCompletion: numpy.array
    :type trueConcordance: dict
    :type prefix: str

    :returns: a tuple containing the chosen indexes (:py:class:`dict`) as the
              first element, the completion (:py:class:`numpy.array`) as the
              second element, and the concordance (:py:class:`dict`) as last
              element.

    It creates two output files: ``prefix.chosen_snps.info`` and
    ``prefix.not_chosen_snps.info``. The first one contains the markers that
    were chosen for completion, and the second one, the markers that weren't.

    It starts by computing the completion of each markers (dividing the number
    of calls divided by the total number of genotypes). Then, for each of the
    duplicated markers, we choose the best one according to completion and
    concordance (see explanation in
    :py:func:`DupSamples.duplicated_samples.chooseBestDuplicates` for more
    details).
    """
    # The output files
    chosenFile = None
    try:
        chosenFile = open(prefix + ".chosen_snps.info", "w")
    except IOError:
        msg = "%(prefix)s.chosen_snps.info: can't write file" % locals()
        raise ProgramError(msg)
    excludedFile = None
    try:
        excludedFile = open(prefix + ".not_chosen_snps.info", "w")
    except IOError:
        msg = "%(prefix)s.not_chosen_snps.info: can't write file" % locals()
        raise ProgramError(msg)

    # Computing the completion (element-wise: nb called / nb total)
    completion = np.true_divide(trueCompletion[0], trueCompletion[1])

    # For each duplicated SNPs
    chosenIndexes = {}
    snpConcordance = {}
    for snp, indexes in snps.iteritems():
        # Getting the completion for those duplicated SNPs
        currCompletion = completion[indexes]

        # Sorting those completion
        sortedCompletionInsexes = np.argsort(currCompletion)

        # Getting the concordance
        concordance = np.true_divide(trueConcordance[snp][0],
                                     trueConcordance[snp][1])
        # For each duplicate, its score is the mean pairwise concordance
        # with all the other duplicates of the group (diagonal excluded)
        currConcordance = [[] for i in xrange(len(indexes))]
        for i in xrange(len(indexes)):
            indexToKeep = list(set(range(len(indexes))) - set([i]))
            currConcordance[i] = np.mean(concordance[i, indexToKeep])
        currConcordance = np.array(currConcordance)
        if snp not in snpConcordance:
            snpConcordance[snp] = currConcordance

        # Sorting the concordance
        sortedConcordanceIndexes = np.argsort(currConcordance)

        # Trying to find the best duplicate to keep: widen the "best"
        # completion/concordance values one rank at a time until one
        # duplicate is among the best for both metrics
        nbToCheck = 1
        chosenIndex = None
        while nbToCheck <= len(indexes):
            # Getting the `nbToCheck` best value (higher to lower)
            completionValue = currCompletion[
                sortedCompletionInsexes[nbToCheck*-1]
            ]
            concordanceValue = currConcordance[
                sortedConcordanceIndexes[nbToCheck*-1]
            ]

            # Getting the indexes to consider
            completionToConsider = set(
                np.where(currCompletion >= completionValue)[0]
            )
            concordanceToConsider = set(
                np.where(currConcordance >= concordanceValue)[0]
            )

            # Getting the intersection of the indexes
            toConsider = concordanceToConsider & completionToConsider
            if len(toConsider) >= 1:
                # Ties are broken at random
                chosenIndex = random.choice(list(toConsider))
                break
            nbToCheck += 1

        if chosenIndex is None:
            msg = "Could not choose the best snp ID"
            raise ProgramError(msg)

        # Printing the chosen SNPs
        print >>chosenFile, tped[indexes[chosenIndex], 1]

        # Printing the excluded SNPs
        for i, index in enumerate(indexes):
            if i != chosenIndex:
                print >>excludedFile, tped[index, 1]

        chosenIndexes[snp] = indexes[chosenIndex]

    # Closing the output files
    chosenFile.close()
    excludedFile.close()

    return chosenIndexes, completion, snpConcordance
def computeFrequency(prefix, outPrefix):
    """Computes the frequency of the SNPs using Plink.

    :param prefix: the prefix of the input files.
    :param outPrefix: the prefix of the output files.

    :type prefix: str
    :type outPrefix: str

    :returns: a :py:class:`dict` containing the frequency of each marker
              (name to ``(maf, (a1, a2))``).

    Start by computing the frequency of all markers using Plink. Then, it
    reads the output file, and saves the frequency and allele information.
    """
    # The plink command
    plinkCommand = ["plink", "--noweb", "--tfile", prefix, "--freq", "--out",
                    outPrefix + ".duplicated_snps"]
    runCommand(plinkCommand)

    # Reading the frequency file (plink writes it with the ".frq" extension)
    snpFreq = {}
    try:
        with open(outPrefix + ".duplicated_snps.frq", "r") as inputFile:
            headerIndex = None
            for i, line in enumerate(inputFile):
                row = createRowFromPlinkSpacedOutput(line)

                if i == 0:
                    # This is the header
                    headerIndex = dict([
                        (row[j], j) for j in xrange(len(row))
                    ])

                    # Checking the column titles
                    for columnTitle in ["SNP", "MAF", "A1", "A2"]:
                        if columnTitle not in headerIndex:
                            msg = "%(outPrefix)s.duplicated_snps.frq: no " \
                                  "column called %(columnTitle)s" % locals()
                            raise ProgramError(msg)
                else:
                    # This is data
                    snpName = row[headerIndex["SNP"]]
                    maf = row[headerIndex["MAF"]]
                    a1 = row[headerIndex["A1"]]
                    a2 = row[headerIndex["A2"]]

                    try:
                        # Plink reports "NA" when the MAF is undefined
                        if maf == "NA":
                            maf = 0.0
                        else:
                            maf = float(maf)
                    except ValueError:
                        msg = "%(outPrefix)s.duplicated_snps.frq: %(maf)s: " \
                              "not a valid MAF" % locals()
                        raise ProgramError(msg)

                    snpFreq[snpName] = (maf, (a1, a2))
    except IOError:
        # BUG FIX: the message named a ".freq" file, but the file actually
        # opened (and created by plink) is ".frq"
        msg = "%(outPrefix)s.duplicated_snps.frq: no such file" % locals()
        raise ProgramError(msg)

    return snpFreq
def runCommand(command):
    """Run a command using subprocess.

    :param command: the command to run (program and arguments as a list, so
                    no shell interpolation is performed).
    :type command: list

    :returns: the merged standard output/standard error of the command, as
              returned by :py:func:`subprocess.check_output`.

    Tries to run a command using :py:mod:`subprocess`. If the command exits
    with a non-zero status, a :py:class:`ProgramError` is raised.
    """
    try:
        # stderr is merged into the captured output so that plink's error
        # messages are not lost
        output = subprocess.check_output(command, stderr=subprocess.STDOUT,
                                         shell=False)
    except subprocess.CalledProcessError:
        msg = "plink: couldn't run plink\n"
        msg += " ".join(command)
        raise ProgramError(msg)

    # The captured output was previously discarded; returning it is
    # backward compatible (callers ignore the return value) and helps
    # debugging failed plink runs
    return output
def printDuplicatedTPEDandTFAM(tped, tfamFileName, outPrefix):
    """Print the duplicated SNPs TPED and TFAM.

    :param tped: a representation of the ``tped`` of duplicated markers.
    :param tfamFileName: the name of the original ``tfam`` file.
    :param outPrefix: the output prefix.

    :type tped: numpy.array
    :type tfamFileName: str
    :type outPrefix: str

    First, it copies the original ``tfam`` into
    ``outPrefix.duplicated_snps.tfam``. Then, it prints the ``tped`` of
    duplicated markers in ``outPrefix.duplicated_snps.tped``.
    """
    # The samples are unchanged, so the tfam is a plain copy
    try:
        shutil.copy(tfamFileName, outPrefix + ".duplicated_snps.tfam")
    except IOError:
        msg = "%(tfamFileName)s: can't copy file to " \
              "%(outPrefix)s.duplicated_snps.tfam" % locals()
        raise ProgramError(msg)

    # Each duplicated marker is written as one tab-separated tped row
    outputFile = None
    try:
        outputFile = open(outPrefix + ".duplicated_snps.tped", "w")
    except IOError:
        msg = "%(outPrefix)s.duplicated_snps.tped: can't write " \
              "file" % locals()
        raise ProgramError(msg)
    for markerRow in tped:
        outputFile.write("\t".join(markerRow) + "\n")
    outputFile.close()
def printConcordance(concordance, prefix, tped, snps):
    """Print the concordance.

    :param concordance: the concordance (numerators and denominators).
    :param prefix: the prefix of the output files.
    :param tped: a representation of the ``tped`` of duplicated markers.
    :param snps: the position of the duplicated markers in the ``tped``.

    :type concordance: dict
    :type prefix: str
    :type tped: numpy.array
    :type snps: dict

    Prints the concordance in a file, in the format of a matrix. For each
    duplicated markers, the first line (starting with the `#` signs) contains
    the name of all the markers in the duplicated markers set. Then a :math:`N
    \\times N` matrix is printed to file (where :math:`N` is the number of
    markers in the duplicated marker list), containing the pairwise
    concordance.
    """
    outFile = None
    try:
        outFile = open(prefix + ".concordance", "w")
    except IOError:
        # BUG FIX: '%' binds tighter than '+', so the original message came
        # out as "<prefix>: can't write file.concordance"
        msg = "%s: can't write file" % (prefix + ".concordance")
        raise ProgramError(msg)
    for snpID in concordance.iterkeys():
        # Header: chromosome, position, then the duplicated marker names
        print >>outFile, "#" + "\t".join(
            list(snpID) + list(tped[snps[snpID], 1])
        )

        # Doing the division (nb identical / nb compared)
        true_concordance = np.true_divide(concordance[snpID][0],
                                          concordance[snpID][1])

        # savetxt renders the matrix tab-separated with 8 decimals
        output = StringIO.StringIO()
        np.savetxt(output, true_concordance, delimiter="\t", fmt="%.8f")
        print >>outFile, output.getvalue().rstrip("\r\n")

    outFile.close()
def printProblems(completion, concordance, tped, snps, frequencies, prefix,
                  diffFreq):
    """Print the statistics.

    :param completion: the completion of each duplicated markers.
    :param concordance: the pairwise concordance between duplicated markers.
    :param tped: a representation of the ``tped`` of duplicated markers.
    :param snps: the positions of the duplicated markers in the ``tped``
    :param frequencies: the frequency of each of the duplicated markers.
    :param prefix: the prefix of the output files.
    :param diffFreq: the frequency difference threshold.

    :type completion: numpy.array
    :type concordance: dict
    :type tped: numpy.array
    :type snps: dict
    :type frequencies: dict
    :type prefix: str
    :type diffFreq: float

    :returns: a :py:class:`set` containing duplicated markers to complete.

    Creates a summary file (``prefix.summary``) containing information about
    duplicated markers: chromosome, position, name, alleles, status, completion
    percentage, completion number and mean concordance.

    The frequency and the minor allele are used to be certain that two
    duplicated markers are exactly the same marker (and not a tri-allelic one,
    for example).

    For each duplicated markers:

    1. Constructs the set of available alleles for the first marker.
    2. Constructs the set of available alleles for the second marker.
    3. If the two sets are different, but the number of alleles is the same, we
       try to flip one of the marker. If the two sets are the same, but the
       number of alleles is 1, we set the status to ``homo_flip``. If the
       markers are heterozygous, we set the status to ``flip``.
    4. If there is a difference in the number of alleles (one is homozygous,
       the other, heterozygous), and that there is on allele in common, we set
       the status to ``homo_hetero``. If there are no allele in common, we try
       to flip one. If the new sets have one allele in common, we set the
       status to ``homo_hetero_flip``.
    5. If the sets of available alleles are the same (without flip), we check
       the frequency and the minor alleles. If the minor allele is different,
       we set the status to ``diff_minor_allele``. If the difference in
       frequencies is higher than a threshold, we set the status to
       ``diff_frequency``.
    6. If all of the above fail, we set the status to ``problem``.

    Problems are written in the ``prefix.problems`` file, and contains the
    following columns: chromosome, position, name and status. This file
    contains all the markers with a status, as explained above.
    """
    completionPercentage = np.true_divide(completion[0], completion[1])

    outSummary = None
    try:
        outSummary = open(prefix + ".summary", "w")
    except IOError:
        # BUG FIX: parenthesized the concatenation ('%' binds tighter than
        # '+') and passed the message to ProgramError (it was raised bare)
        msg = "%s: can't write file" % (prefix + ".summary")
        raise ProgramError(msg)

    # Prints the header of the summary file
    print >>outSummary, "\t".join(["chr", "pos", "name", "alleles", "status",
                                   "% completion", "completion",
                                   "mean concordance"])

    # The data structure containing the problems
    problems = {}
    for snpID, indexes in snps.iteritems():
        for i, index in enumerate(indexes):
            # The SNP information (chromosome and position)
            toPrint = list(snpID)

            # The name of the SNP
            snpName = tped[index, 1]
            toPrint.append(snpName)

            # The frequency of the SNP
            snpFreq, mafAlleles = frequencies[snpName]

            # A list of the other SNP name with problems
            otherSnpNameWithProblem = set()

            # The alleles observed for this marker (null calls excluded)
            alleles = set()
            otherAlleles = set()
            status = []
            for genotype in np.unique(tped[index, 4:]):
                alleles |= set(genotype.split(" "))
            if "0" in alleles:
                alleles.remove("0")

            # Comparing with every other duplicate of the group
            for j in xrange(i+1, len(indexes)):
                otherIndex = indexes[j]
                otherSnpName = tped[otherIndex, 1]

                # The frequency of the other SNP
                otherSnpFreq, otherMafAlleles = frequencies[otherSnpName]

                # Checking the alleles
                for genotype in np.unique(tped[otherIndex, 4:]):
                    otherAlleles |= set(genotype.split(" "))
                if "0" in otherAlleles:
                    otherAlleles.remove("0")

                if alleles != otherAlleles:
                    if len(alleles) == len(otherAlleles):
                        # Same number of alleles
                        # Try the flipped ones
                        otherAlleles = flipGenotype(otherAlleles)
                        if alleles == otherAlleles:
                            if len(alleles) == 1:
                                status.append("homo_flip")
                                otherSnpNameWithProblem.add(otherSnpName)
                            else:
                                status.append("flip")
                                otherSnpNameWithProblem.add(otherSnpName)
                        else:
                            status.append("problem")
                            otherSnpNameWithProblem.add(otherSnpName)
                    else:
                        # Different number of alleles
                        if len(alleles & otherAlleles) == 1:
                            status.append("homo_hetero")
                            otherSnpNameWithProblem.add(otherSnpName)
                        else:
                            # Try the flipped one
                            otherAlleles = flipGenotype(otherAlleles)
                            if len(alleles & otherAlleles) == 1:
                                status.append("homo_hetero_flip")
                                otherSnpNameWithProblem.add(otherSnpName)
                            else:
                                status.append("problem")
                                otherSnpNameWithProblem.add(otherSnpName)
                else:
                    # The alleles are the same, so we check the frequency
                    if mafAlleles[0] != otherMafAlleles[0]:
                        # They don't have the same minor allele
                        status.append("diff_minor_allele")
                        otherSnpNameWithProblem.add(otherSnpName)
                    elif math.fabs(snpFreq - otherSnpFreq) > diffFreq:
                        # They don't have same frequency
                        status.append("diff_frequency")
                        otherSnpNameWithProblem.add(otherSnpName)

            # The alleles (duplicated when homozygous, e.g. "A A")
            alleles = list(alleles)
            alleles.sort()
            if len(alleles) == 1:
                alleles.append(alleles[0])
            toPrint.append(" ".join(alleles))
            toPrint.append(";".join(status))

            # The completion
            toPrint.append("%.8f" % completionPercentage[index])
            toPrint.append("%d/%d" % (completion[0][index],
                                      completion[1][index]))

            # The concordance (mean pairwise, diagonal excluded)
            indexToKeep = list(set(range(len(indexes))) - set([i]))
            currConcordance = np.true_divide(
                concordance[snpID][0][i, indexToKeep],
                concordance[snpID][1][i, indexToKeep],
            )
            currConcordance = np.mean(currConcordance)
            toPrint.append("%.8f" % currConcordance)
            print >>outSummary, "\t".join(toPrint)

            # Now updating the problems data structure
            if len(status) != len(otherSnpNameWithProblem):
                msg = "There is a problem with the problematic SNPs"
                raise ProgramError(msg)
            if len(status) > 0:
                if snpID not in problems:
                    tmp = {"snpNames": {snpName}, "problems": set()}
                    problems[snpID] = tmp
                # We have problems
                problems[snpID]["snpNames"] |= otherSnpNameWithProblem
                problems[snpID]["problems"] |= set(status)

    outSummary.close()

    outProblems = None
    try:
        outProblems = open(prefix + ".problems", "w")
    except IOError:
        # BUG FIX: same precedence fix as above, and ProgramError is now
        # raised with the message instead of bare
        msg = "%s: can't write file" % (prefix + ".problems")
        raise ProgramError(msg)

    # Printing the header of the problem file...
    print >>outProblems, "\t".join(["chr", "pos", "name", "status"])

    for snpID in problems.iterkeys():
        toPrint = list(snpID)
        toPrint.append(";".join(list(problems[snpID]["snpNames"])))
        toPrint.append(";".join(list(problems[snpID]["problems"])))
        print >>outProblems, "\t".join(toPrint)
    outProblems.close()

    # Returning the SNPs to complete (those without any problem)
    return set(snps.keys()) - set(problems.keys())
def computeStatistics(tped, tfam, snps):
    """Computes the completion and concordance of each SNPs.

    :param tped: a representation of the ``tped``.
    :param tfam: a representation of the ``tfam``
    :param snps: the position of the duplicated markers in the ``tped``.

    :type tped: numpy.array
    :type tfam: list
    :type snps: dict

    :returns: a tuple containing the completion of duplicated markers
              (:py:class:`numpy.array`) as first element, and the concordance
              (:py:class:`dict`) of duplicated markers, as last element.

    A marker's completion is compute using this formula (where :math:`G_i` is
    the set of genotypes for the marker :math:`i`):

    .. math::
        Completion_i = \\frac{||g \\in G_i \\textrm{ where } g \\neq 0||}
                             {||G_i||}

    The pairwise concordance between duplicated markers is compute as follow
    (where :math:`G_i` and :math:`G_j` are the sets of genotypes for markers
    :math:`i` and :math:`j`, respectively):

    .. math::
        Concordance_{i,j} = \\frac{
            ||g \\in G_i \\cup G_j \\textrm{ where } g_i = g_j \\neq 0||
        }{
            ||g \\in G_i \\cup G_j \\textrm{ where } g \\neq 0||
        }

    Hence, we only computes the numerators and denominators of the completion
    and concordance, for future reference.

    .. note::
        When the genotypes are not comparable, the function tries to flip one
        of the genotype to see if it becomes comparable.
    """
    # The completion data type: row 0 holds numerators (nb of calls), row 1
    # holds denominators (nb of genotypes), one column per marker
    completion = np.array([[0 for i in xrange(len(tped))],
                           [0 for i in xrange(len(tped))]])

    # The concordance data type: for each duplicated-marker group, a pair of
    # nbDup x nbDup matrices (numerators and denominators)
    concordance = {}
    for snpID in snps.keys():
        nbDup = len(snps[snpID])
        concordance[snpID] = [
            np.asmatrix(np.zeros((nbDup, nbDup), dtype=int)),
            np.asmatrix(np.zeros((nbDup, nbDup), dtype=int))
        ]

    # The women and the no sex (sex code is column 4 of the tfam)
    menIndex = np.where(tfam[:, 4] == "1")
    womenIndex = np.where(tfam[:, 4] == "2")
    noSexIndex = np.where(tfam[:, 4] == "0")

    for snpID, indexes in snps.iteritems():
        nbDup = len(indexes)
        currGenotypes = tped[indexes, 4:]
        chromosome, position = snpID
        # NOTE(review): the sexual-chromosome handling below is disabled in
        # the original code; kept as-is
        # if chromosome == "24":
        #     # Remove the heterozygous men
        #     menToRemove = getIndexOfHeteroMen(currGenotypes, menIndex)
        #     # Remove the women and the no sex
        #     currGenotypes = np.delete(currGenotypes,
        #                               np.hstack((womenIndex, noSexIndex,
        #                                          menToRemove)), 1)
        # elif chromosome == "23":
        #     # Remove the heterozygous men
        #     menToRemove = getIndexOfHeteroMen(currGenotypes, menIndex)
        #     # Remove the no sex
        #     currGenotypes = np.delete(currGenotypes,
        #                               np.hstack((noSexIndex, menToRemove)),
        #                               1)
        for i in xrange(nbDup):
            # Compute completion here (nb of non-null calls / nb of samples)
            completion[0][indexes[i]] = len(
                np.where(currGenotypes[i] != "0 0")[0]
            )
            completion[1][indexes[i]] = len(currGenotypes[i])

            for j in xrange(i+1, nbDup):
                # Compute concordance here
                # Removing samples with at least one null genotype
                nullGenotypeIndexes = np.where(
                    np.any(currGenotypes[[i, j]] == "0 0", 0)
                )
                subGenotypes = np.delete(
                    currGenotypes,
                    nullGenotypeIndexes,
                    1,
                )

                # Finding the errors in the subseted genotypes
                errorIndexes = np.where(subGenotypes[i] != subGenotypes[j])[0]
                nbDiff = len(errorIndexes)
                for k in errorIndexes:
                    # Getting the genotypes
                    genotype1 = set(subGenotypes[i, k].split(" "))
                    genotype2 = set(subGenotypes[j, k].split(" "))

                    # Checking for flips
                    if len(genotype1) == len(genotype2):
                        # Both have the same number of different alleles,
                        # so they might be flipped
                        genotype2 = flipGenotype(genotype2)
                        if genotype1 == genotype2:
                            # The genotypes are equivalent after the flip
                            nbDiff -= 1

                # Updating the concordance (matrices are symmetric)
                nbTot = len(subGenotypes[i])
                concordance[snpID][0][i, j] = nbTot - nbDiff
                concordance[snpID][0][j, i] = nbTot - nbDiff
                if nbTot == 0:
                    # We will have a division by 0...
                    nbTot = 1
                concordance[snpID][1][i, j] = nbTot
                concordance[snpID][1][j, i] = nbTot

    # A marker is always fully concordant with itself
    for snpID in concordance.iterkeys():
        for i in range(len(concordance[snpID][0])):
            concordance[snpID][0][i, i] = 1
            concordance[snpID][1][i, i] = 1

    return completion, concordance
def getIndexOfHeteroMen(genotypes, menIndex):
    """Get the indexes of heterozygous men.

    :param genotypes: the genotypes of everybody.
    :param menIndex: the indexes of the men (for the genotypes).

    :type genotypes: numpy.array
    :type menIndex: numpy.array

    :returns: a :py:class:`numpy.array` containing the indexes of the
              genotypes to remove.

    Finds the men that have a heterozygous genotype for this current marker.
    Usually used on sexual chromosomes.
    """
    heteroIndexes = set()
    for sampleIndex in menIndex[0]:
        # A genotype such as "A G" yields two distinct alleles: heterozygous
        for alleles in (set(g.split(" ")) for g in genotypes[:, sampleIndex]):
            if len(alleles) != 1:
                heteroIndexes.add(sampleIndex)
    return (np.array(sorted(heteroIndexes), dtype=int),)
def flipGenotype(genotype):
    """Flips a genotype.

    :param genotype: the genotype to flip.
    :type genotype: set

    :returns: the new flipped genotype (as a :py:class:`set`)

    :raises ProgramError: if an allele is not one of ``A``, ``C``, ``G``
                          or ``T``.

    .. testsetup::

        from pyGenClean.DupSNPs.duplicated_snps import flipGenotype

    .. doctest::

        >>> flipGenotype({"A", "T"})
        set(['A', 'T'])
        >>> flipGenotype({"C", "T"})
        set(['A', 'G'])
        >>> flipGenotype({"T", "G"})
        set(['A', 'C'])
        >>> flipGenotype({"0", "0"})
        Traceback (most recent call last):
        ...
        ProgramError: 0: unknown allele
        >>> flipGenotype({"A", "N"})
        Traceback (most recent call last):
        ...
        ProgramError: N: unknown allele
    """
    # BUG FIX: the doctests expected "unkown allele", but the code raises
    # "unknown allele"; the doctests now match the actual message
    complement = {"A": "T", "T": "A", "C": "G", "G": "C"}
    newGenotype = set()
    for allele in genotype:
        if allele not in complement:
            msg = "%(allele)s: unknown allele" % locals()
            raise ProgramError(msg)
        newGenotype.add(complement[allele])
    return newGenotype
def processTPED(uniqueSNPs, mapF, fileName, tfam, prefix):
    """Process the TPED file.

    :param uniqueSNPs: the unique markers, indexed by (chromosome, position).
    :param mapF: a representation of the ``map`` file.
    :param fileName: the name of the ``tped`` file.
    :param tfam: the name of the ``tfam`` file.
    :param prefix: the prefix of all the files.

    :type uniqueSNPs: dict
    :type mapF: list
    :type fileName: str
    :type tfam: str
    :type prefix: str

    :returns: a tuple with the representation of the ``tped`` file
              (:py:class:`numpy.array`) as first element, and the updated
              position of the duplicated markers in the ``tped``
              representation.

    :raises ProgramError: if an output file can't be written, or if the number
                          of markers differs from the ``map`` file.

    Copies the ``tfam`` file into ``prefix.unique_snps.tfam``. While reading
    the ``tped`` file, creates a new one (``prefix.unique_snps.tped``)
    containing only unique markers.
    """
    # Copying the tfam file
    # BUG FIX: the original message was built as ("%s..." % prefix) + suffix
    # because % binds tighter than +, garbling the reported file name.
    tfamOutputName = prefix + ".unique_snps.tfam"
    try:
        shutil.copy(tfam, tfamOutputName)
    except IOError:
        raise ProgramError("%s: can't write file" % tfamOutputName)

    tped = []
    updatedSNPs = defaultdict(list)

    tpedOutputName = prefix + ".unique_snps.tped"
    try:
        outputFile = open(tpedOutputName, "w")
    except IOError:
        raise ProgramError("%s: can't write to file" % tpedOutputName)

    nbSNP = 0
    with open(fileName, 'r') as inputFile:
        for line in inputFile:
            nbSNP += 1
            row = line.rstrip("\r\n").split("\t")

            snpInfo = row[:4]
            # Alleles are normalized to upper case.
            genotype = [i.upper() for i in row[4:]]

            chromosome = snpInfo[0]
            position = snpInfo[3]

            if (chromosome, position) in uniqueSNPs:
                # Writing the new TPED file (unique SNPs only)
                outputFile.write("\t".join(snpInfo + genotype) + "\n")
            else:
                # Keeping the duplicated markers in memory, remembering where
                # each (chromosome, position) ended up in ``tped``.
                currPos = len(tped)
                tped.append(tuple(snpInfo + genotype))
                updatedSNPs[(chromosome, position)].append(currPos)
    outputFile.close()

    if len(mapF) != nbSNP:
        msg = "%(fileName)s: not the same number of SNPs as in the MAP " \
              "file" % locals()
        raise ProgramError(msg)

    tped = np.array(tped)
    return tped, updatedSNPs
def findUniques(mapF):
    """Finds the unique markers in a MAP.

    :param mapF: representation of a ``map`` file.
    :type mapF: list

    :returns: a :py:class:`dict` containing unique markers (according to their
              genomic localisation), mapping (chromosome, position) to the
              marker's index in ``mapF``.
    """
    uSNPs = {}
    dSNPs = defaultdict(list)
    for i, row in enumerate(mapF):
        chromosome = row[0]
        position = row[3]
        snpID = (chromosome, position)

        if snpID not in uSNPs:
            # This is the first time we see this marker position
            uSNPs[snpID] = i
        elif snpID not in dSNPs:
            # Second occurrence: record both the first index and this one
            dSNPs[snpID].extend([uSNPs[snpID], i])
        else:
            # We have seen this marker position multiple times already
            dSNPs[snpID].append(i)

    # Removing the duplicates from the unique markers.
    # BUG FIX: ``dSNPs.iterkeys()`` is Python-2-only; plain iteration over the
    # dict is equivalent and works on both Python 2 and 3.
    for snpID in dSNPs:
        if snpID in uSNPs:
            del uSNPs[snpID]

    return uSNPs
def readTFAM(fileName):
    """Reads the TFAM file.

    :param fileName: the name of the ``tfam`` file.
    :type fileName: str

    :returns: a representation the ``tfam`` file (:py:class:`numpy.array`).
    """
    # Each sample becomes one tuple of its tab-separated fields; numpy turns
    # the list of equal-length tuples into a 2D array of strings.
    with open(fileName, 'r') as inputFile:
        samples = [tuple(line.rstrip("\r\n").split("\t"))
                   for line in inputFile]
    return np.array(samples)
def readMAP(fileName, prefix):
    """Reads the MAP file.

    :param fileName: the name of the ``map`` file.
    :param prefix: the prefix of the output files.

    :type fileName: str
    :type prefix: str

    :returns: a list of tuples, representing the ``map`` file.

    :raises ProgramError: if the duplicated-marker-names file can't be written.

    While reading the ``map`` file, it saves a file
    (``prefix.duplicated_marker_names``) containing the name of the unique
    duplicated markers.
    """
    # Saving the MAP file (one tuple of tab-separated fields per marker)
    with open(fileName, 'r') as inputFile:
        mapF = [
            tuple(i.rstrip("\r\n").split("\t")) for i in inputFile.readlines()
        ]

    # Test for uniqueness of names
    marker_names = np.array([i[1] for i in mapF])
    nb_with_same_name = len(marker_names) - len(np.unique(marker_names))

    if nb_with_same_name > 0:
        logger.info(" - {} markers with same name".format(nb_with_same_name))

        # Mask of every marker whose name appears more than once (including
        # the first occurrence of each duplicated name).
        u, indices = np.unique(marker_names, return_index=True)
        duplicated_indices = np.setdiff1d(np.arange(len(marker_names)),
                                          indices)
        duplicated_indices = np.in1d(marker_names,
                                     marker_names[duplicated_indices])

        # NOTE(review): the int dtype assumes purely numeric chromosomes; a
        # marker on chromosome "X" would make this conversion fail — confirm
        # chromosomes are already encoded numerically upstream.
        marker_chr_pos = np.array([(i[0], i[3]) for i in mapF],
                                  dtype=[("chr", int),
                                         ("pos", int)])[duplicated_indices]
        marker_names = marker_names[duplicated_indices]

        try:
            duplicated_marker_names = open(prefix + ".duplicated_marker_names",
                                           "w")
        except IOError:
            msg = "{}.duplicated_marker_names: can't write file".format(prefix)
            raise ProgramError(msg)

        # BUG FIX: the original used ``[name] + map(str, ...)``, which breaks
        # on Python 3 (list + map object), and py2-only ``xrange``/``print >>``.
        try:
            for i in range(len(marker_names)):
                fields = [marker_names[i]] + [str(v)
                                              for v in marker_chr_pos[i]]
                duplicated_marker_names.write("\t".join(fields) + "\n")
        finally:
            # Close the file even if a write fails half-way through.
            duplicated_marker_names.close()

    return mapF
def checkArgs(args):
    """Checks the arguments and options.

    :param args: an object containing the options of the program.
    :type args: argparse.Namespace

    :returns: ``True`` if everything was OK.

    If there is a problem with an option, an exception is raised using the
    :py:class:`ProgramError` class, a message is printed to the
    :class:`sys.stderr` and the program exists with code 1.
    """
    # Checking the input files
    for suffix in [".tped", ".tfam", ".map"]:
        fileName = args.tfile + suffix
        if not os.path.isfile(fileName):
            msg = "%(fileName)s: no such file" % locals()
            raise ProgramError(msg)

    # Checking the concordance threshold
    if not (0 <= args.snp_concordance_threshold <= 1):
        msg = "snp-concordance-threshold: must be between 0 and 1 " \
              "(not %f)" % args.snp_concordance_threshold
        raise ProgramError(msg)

    # Checking the completion threshold
    if not (0 <= args.snp_completion_threshold <= 1):
        msg = "snp-completion-threshold: must be between 0 and 1 " \
              "(not %f)" % args.snp_completion_threshold
        raise ProgramError(msg)

    # Checking the difference in frequency
    # BUG FIX: the original called "{}...".format() with no argument, which
    # raised an IndexError instead of the intended ProgramError.
    if (args.frequency_difference < 0) or (args.frequency_difference > 1):
        msg = ("{}: maximal frequency difference: value must be between 0 and "
               "1 (inclusively)".format(args.frequency_difference))
        raise ProgramError(msg)

    return True
def parseArgs(argString=None):  # pragma: no cover
    """Parses the command line options and arguments.

    :param argString: the options.
    :type argString: list

    :returns: A :py:class:`argparse.Namespace` object created by the
              :py:mod:`argparse` module. It contains the values of the
              different options.

    =============================== ====== ====================================
    Options                         Type   Description
    =============================== ====== ====================================
    ``--tfile``                     string The input file prefix (Plink tfile).
    ``--snp-completion-threshold``  float  The completion threshold to consider
                                           a replicate when choosing the best
                                           replicates.
    ``--snp-concordance-threshold`` float  The concordance threshold to
                                           consider a replicate when choosing
                                           the best replicates.
    ``--frequency_difference``      float  The maximum difference in frequency
                                           between duplicated markers.
    ``--out``                       string The prefix of the output files.
    =============================== ====== ====================================

    .. note::
        No option check is done here (except for the one automatically done by
        argparse). Those need to be done elsewhere (see :py:func:`checkArgs`).
    """
    # ``parse_args(None)`` falls back to ``sys.argv`` exactly as a bare
    # ``parse_args()`` call does, so one call handles both cases.
    return parser.parse_args(argString)
class ProgramError(Exception):
    """An :py:class:`Exception` raised in case of a problem.

    :param msg: the message to print to the user before exiting.
    :type msg: str
    """
    def __init__(self, msg):
        """Construction of the :py:class:`ProgramError` class.

        :param msg: the message to print to the user
        :type msg: str
        """
        # Also initialize the base Exception so that ``e.args``, ``repr`` and
        # pickling behave like a standard exception (the original skipped it).
        Exception.__init__(self, str(msg))
        # Kept for backward compatibility: callers read ``e.message``.
        self.message = str(msg)

    def __str__(self):
        return self.message
# The parser object
# NOTE(review): pretty_name/desc/long_desc look like metadata consumed by the
# pyGenClean pipeline runner — confirm against the package's script registry.
pretty_name = "Duplicated markers"
desc = "Extracts and merges duplicated markers."
long_desc = ("The script searches for duplicated markers according to "
             "chromosomal location. It then evaluates concordance, completion "
             "rate, allele calls and minor allele frequency (MAF). The script "
             "keeps markers with different allele calls or with different "
             "MAF. If thresholds are met, the script merges and completes the "
             "genotypes.")
parser = argparse.ArgumentParser(description=desc)
parser.add_argument("-v", "--version", action="version",
                    version="pyGenClean version {}".format(__version__))
# The INPUT files
group = parser.add_argument_group("Input File")
group.add_argument("--tfile", type=str, metavar="FILE", required=True,
                   help=("The input file prefix (will find the tped and tfam "
                         "file by appending the prefix to .tped and .tfam, "
                         "respectively. A .map file is also required."))
# The options
# Thresholds below are used when choosing the best replicate of a duplicated
# marker (see checkArgs for the 0..1 range validation).
group = parser.add_argument_group("Options")
group.add_argument("--snp-completion-threshold", type=float, metavar="FLOAT",
                   default=0.9, help=("The completion threshold to consider "
                                      "a replicate when choosing the best "
                                      "replicates and for composite creation. "
                                      "[default: %(default).1f]"))
group.add_argument("--snp-concordance-threshold", type=float, metavar="FLOAT",
                   default=0.98, help=("The concordance threshold to consider "
                                       "a replicate when choosing the best "
                                       "replicates and for composite "
                                       "creation. [default: %(default).2f]"))
group.add_argument("--frequency_difference", type=float, metavar="FLOAT",
                   default=0.05, help=("The maximum difference in frequency "
                                       "between duplicated markers [default: "
                                       "%(default).2f]"))
# The OUTPUT files
group = parser.add_argument_group("Output File")
group.add_argument("--out", type=str, metavar="FILE", default="dup_snps",
                   help=("The prefix of the output files. [default: "
                         "%(default)s]"))
[docs]def safe_main():
    """A safe version of the main function (that catches ProgramError)."""
    try:
        main()
    except KeyboardInterrupt:
        # User interruption (Ctrl-C) is not an error: exit with status 0.
        logger.info("Cancelled by user")
        sys.exit(0)
    except ProgramError as e:
        # Log the problem, then let argparse report it and exit non-zero.
        logger.error(e.message)
        parser.error(e.message)
if __name__ == "__main__":
    safe_main()