#!/usr/bin/env python2.7
# This file is part of pyGenClean.
#
# pyGenClean is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation, either version 3 of the License, or (at your option) any later
# version.
#
# pyGenClean is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
# A PARTICULAR PURPOSE. See the GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along with
# pyGenClean. If not, see <http://www.gnu.org/licenses/>.
import os
import sys
import logging
import argparse
import subprocess
from itertools import izip
from collections import defaultdict
from ..PlinkUtils import plot_MDS as PlotMDS
from ..PlinkUtils import createRowFromPlinkSpacedOutput
from .. import __version__
from . import find_outliers
from . import plot_eigenvalues as PlotEigenvalues
from ..DupSNPs.duplicated_snps import flipGenotype
from ..RelatedSamples import find_related_samples as Relatedness
logger = logging.getLogger("check_ethnicity")
[docs]class Dummy(object):
pass
[docs]def main(argString=None):
"""The main function.
:param argString: the options.
:type argString: list
These are the steps of this module:
1. Prints the options.
2. Finds the overlapping markers between the three reference panels and
the source panel (:py:func:`findOverlappingSNPsWithReference`).
3. Extract the required markers from all the data sets
(:py:func:`extractSNPs`).
4. Renames the reference panel's marker names to that they are the same as
the source panel (for all populations) (:py:func:`renameSNPs`).
5. Combines the three reference panels together
(:py:func:`combinePlinkBinaryFiles`).
6. Compute the frequency of all the markers from the reference and the
source panels (:py:func:`computeFrequency`).
7. Finds the markers to flip in the reference panel (when compared to the
source panel) (:py:func:`findFlippedSNPs`).
8. Excludes the unflippable markers from the reference and the source
panels (:py:func:`excludeSNPs`).
9. Flips the markers that need flipping in their reference panel
(:py:func:`flipSNPs`).
10. Combines the reference and the source panels
(:py:func:`combinePlinkBinaryFiles`).
11. Runs part of :py:mod:`pyGenClean.RelatedSamples.find_related_samples`
on the combined data set (:py:func:`runRelatedness`).
12. Creates the ``mds`` file from the combined data set and the result of
previous step (:py:func:`createMDSFile`).
13. Creates the population file (:py:func:`createPopulationFile`).
14. Plots the ``mds`` values (:py:func:`plotMDS`).
15. Finds the outliers of a given reference population
(:py:func:`find_the_outliers`).
16. If required, computes the Eigenvalues using smartpca
(:py:func:`compute_eigenvalues`).
17. If required, creates a scree plot from smartpca resutls
(:py:func:`create_scree_plot`).
"""
# Getting and checking the options
args = parseArgs(argString)
checkArgs(args)
logger.info("Options used:")
for key, value in vars(args).iteritems():
logger.info(" --{} {}".format(key.replace("_", "-"), value))
newBfile = None
popNames = ["CEU", "YRI", "JPT-CHB"]
referencePrefixes = [args.ceu_bfile, args.yri_bfile, args.jpt_chb_bfile]
if not args.skip_ref_pops:
# Find overlap with the reference file
logger.info("Finding overlapping SNPs between reference and "
"source panels")
findOverlappingSNPsWithReference(
prefix=args.bfile,
referencePrefixes=referencePrefixes,
referencePopulations=popNames,
outPrefix=args.out,
)
# Extract the required SNPs using Plink (reference panels)
logger.info("Extracting overlapping SNPs from the reference panels")
extractSNPs(
snpToExtractFileNames=[
args.out + ".{}_snp_to_extract".format(popName)
for popName in popNames
],
referencePrefixes=referencePrefixes,
popNames=popNames,
outPrefix=args.out + ".reference_panel",
runSGE=args.sge,
options=args,
)
# Extract the required SNPs using Plink (source panel)
logger.info("Extracting overlapping SNPs from the source panel")
extractSNPs(
snpToExtractFileNames=[args.out + ".source_snp_to_extract"],
referencePrefixes=[args.bfile],
popNames=["ALL"],
outPrefix=args.out + ".source_panel",
runSGE=False,
options=args,
)
# Renaming the reference file, so that the SNP names are the same
for pop in popNames:
logger.info("Renaming reference panel's SNPs {} to match source "
"panel".format(pop))
renameSNPs(
inPrefix=args.out + ".reference_panel.{}".format(pop),
updateFileName=args.out + ".{}_update_names".format(pop),
outPrefix=args.out + ".reference_panel.{}.rename".format(pop),
)
# Combining the reference panel
logger.info("Combining the reference panels")
combinePlinkBinaryFiles(
prefixes=[
args.out + ".reference_panel.{}.rename".format(pop)
for pop in popNames
],
outPrefix=args.out + ".reference_panel.ALL.rename",
)
# Computing the frequency (reference panel)
logger.info("Computing reference panel frequencies")
computeFrequency(
prefix=args.out + ".reference_panel.ALL.rename",
outPrefix=args.out + ".reference_panel.ALL.rename.frequency",
)
# Computing the frequency (source panel)
logger.info("Computing source panel frequencies")
computeFrequency(
prefix=args.out + ".source_panel.ALL",
outPrefix=args.out + ".source_panel.ALL.frequency",
)
# Finding the SNPs to flip and flip them in the reference panel
logger.info("Finding SNPs to flip or to exclude from reference panel")
findFlippedSNPs(
frqFile1=args.out + ".reference_panel.ALL.rename.frequency.frq",
frqFile2=args.out + ".source_panel.ALL.frequency.frq",
outPrefix=args.out,
)
# Excluding SNPs (reference panel)
logger.info("Excluding SNPs from reference panel")
excludeSNPs(
inPrefix=args.out + ".reference_panel.ALL.rename",
outPrefix=args.out + ".reference_panel.ALL.rename.cleaned",
exclusionFileName=args.out + ".snp_to_remove",
)
# Excluding SNPs (source panel)
logger.info("Excluding SNPs from source panel")
excludeSNPs(args.out + ".source_panel.ALL",
args.out + ".source_panel.ALL.cleaned",
args.out + ".snp_to_remove")
# Flipping the SNP that need to be flip in the reference
logger.info("Flipping SNPs in reference panel")
flipSNPs(
inPrefix=args.out + ".reference_panel.ALL.rename.cleaned",
outPrefix=args.out + ".reference_panel.ALL.rename.cleaned.flipped",
flipFileName=args.out + ".snp_to_flip_in_reference",
)
# Combining the reference panel
logger.info("Combining reference and source panels")
combinePlinkBinaryFiles(
prefixes=[args.out + ".reference_panel.ALL.rename.cleaned.flipped",
args.out + ".source_panel.ALL.cleaned"],
outPrefix=args.out + ".final_dataset_for_genome",
)
# Runing the relatedness step
logger.info("Creating the genome file using Plink")
newBfile = runRelatedness(
inputPrefix=args.out + ".final_dataset_for_genome",
outPrefix=args.out,
options=args,
)
else:
# Just run relatedness on the dataset
newBfile = runRelatedness(
inputPrefix=args.bfile,
outPrefix=args.out,
options=args,
)
# Creating the MDS file
logger.info("Creating the MDS file using Plink")
createMDSFile(
nb_components=args.nb_components,
inPrefix=newBfile,
outPrefix=args.out + ".mds",
genomeFileName=args.out + ".ibs.genome.genome",
)
if not args.skip_ref_pops:
# Creating the population files
logger.info("Creating a population file")
famFiles = [
args.out + ".reference_panel." + i + ".fam" for i in popNames
]
famFiles.append(args.out + ".source_panel.ALL.fam")
labels = popNames + ["SOURCE"]
createPopulationFile(
inputFiles=famFiles,
labels=labels,
outputFileName=args.out + ".population_file",
)
# Plot the MDS value
logger.info("Creating the MDS plot")
plotMDS(
inputFileName=args.out + ".mds.mds",
outPrefix=args.out + ".mds",
populationFileName=args.out + ".population_file",
options=args,
)
# Finding the outliers
logger.info("Finding the outliers")
find_the_outliers(
mds_file_name=args.out + ".mds.mds",
population_file_name=args.out + ".population_file",
ref_pop_name=args.outliers_of,
multiplier=args.multiplier,
out_prefix=args.out,
)
# De we need to create a scree plot?
if args.create_scree_plot:
# Computing the eigenvalues using smartpca
logger.info("Computing eigenvalues")
compute_eigenvalues(
in_prefix=args.out + ".ibs.pruned_data",
out_prefix=args.out + ".smartpca",
)
logger.info("Creating scree plot")
create_scree_plot(
in_filename=args.out + ".smartpca.evec.txt",
out_filename=args.out + ".smartpca.scree_plot.png",
plot_title=args.scree_plot_title,
)
[docs]def create_scree_plot(in_filename, out_filename, plot_title):
"""Creates a scree plot using smartpca results.
:param in_filename: the name of the input file.
:param out_filename: the name of the output file.
:param plot_title: the title of the scree plot.
:type in_filename: str
:type out_filename: str
:type plot_title: str
"""
# Constructing the arguments
scree_plot_args = ("--evec", in_filename, "--out", out_filename,
"--scree-plot-title", plot_title)
try:
# Executing the main method
PlotEigenvalues.main(argString=scree_plot_args)
except PlotEigenvalues.ProgramError as e:
msg = "PlotEigenvalues: {}".format(e)
raise ProgramError(msg)
[docs]def compute_eigenvalues(in_prefix, out_prefix):
"""Computes the Eigenvalues using smartpca from Eigensoft.
:param in_prefix: the prefix of the input files.
:param out_prefix: the prefix of the output files.
:type in_prefix: str
:type out_prefix: str
Creates a "parameter file" used by smartpca and runs it.
"""
# First, we create the parameter file
with open(out_prefix + ".parameters", "w") as o_file:
print >>o_file, "genotypename: " + in_prefix + ".bed"
print >>o_file, "snpname: " + in_prefix + ".bim"
print >>o_file, "indivname: " + in_prefix + ".fam"
print >>o_file, "evecoutname: " + out_prefix + ".evec.txt"
print >>o_file, "evaloutname: " + out_prefix + ".eval.txt"
print >>o_file, "numoutlieriter: 0"
print >>o_file, "altnormstyle: NO"
# Executing smartpca
command = ["smartpca", "-p", out_prefix + ".parameters"]
runCommand(command)
[docs]def find_the_outliers(mds_file_name, population_file_name, ref_pop_name,
multiplier, out_prefix):
"""Finds the outliers of a given population.
:param mds_file_name: the name of the ``mds`` file.
:param population_file_name: the name of the population file.
:param ref_pop_name: the name of the reference population for which to find
outliers from.
:param multiplier: the multiplier of the cluster standard deviation to
modify the strictness of the outlier removal procedure.
:param out_prefix: the prefix of the output file.
:type mds_file_name: str
:type population_file_name: str
:type ref_pop_name: str
:type multiplier: float
:type out_prefix: str
Uses the :py:mod:`pyGenClean.Ethnicity.find_outliers` modules to find
outliers. It requires the ``mds`` file created by :py:func:`createMDSFile`
and the population file created by :py:func:`createPopulationFile`.
"""
options = ["--mds", mds_file_name, "--population-file",
population_file_name, "--outliers-of", ref_pop_name,
"--multiplier", str(multiplier), "--out", out_prefix]
try:
find_outliers.main(options)
except find_outliers.ProgramError as e:
msg = "find_outliers: {}".format(e)
raise ProgramError(msg)
[docs]def createPopulationFile(inputFiles, labels, outputFileName):
"""Creates a population file.
:param inputFiles: the list of input files.
:param labels: the list of labels (corresponding to the input files).
:param outputFileName: the name of the output file.
:type inputFiles: list
:type labels: list
:type outputFileName: str
The ``inputFiles`` is in reality a list of ``tfam`` files composed of
samples. For each of those ``tfam`` files, there is a label associated with
it (representing the name of the population).
The output file consists of one row per sample, with the following three
columns: the family ID, the individual ID and the population of each
sample.
"""
outputFile = None
try:
outputFile = open(outputFileName, 'w')
except IOError:
msg = "%(outputFileName)s: can't write file"
raise ProgramError(msg)
for i in xrange(len(inputFiles)):
# For each file
fileName = inputFiles[i]
label = labels[i]
try:
with open(fileName, 'r') as inputFile:
for line in inputFile:
row = line.rstrip("\r\n").split(" ")
# Getting the informations
famID = row[0]
indID = row[1]
# Printing to file
print >>outputFile, "\t".join([famID, indID, label])
except IOError:
msg = "%(fileName)s: no such file" % locals()
raise ProgramError(msg)
# Closing the output file
outputFile.close()
[docs]def plotMDS(inputFileName, outPrefix, populationFileName, options):
"""Plots the MDS value.
:param inputFileName: the name of the ``mds`` file.
:param outPrefix: the prefix of the output files.
:param populationFileName: the name of the population file.
:param options: the options
:type inputFileName: str
:type outPrefix: str
:type populationFileName: str
:type options: argparse.Namespace
Plots the ``mds`` value according to the ``inputFileName`` file (``mds``)
and the ``populationFileName`` (the population file).
"""
# The options
plotMDS_options = Dummy()
plotMDS_options.file = inputFileName
plotMDS_options.out = outPrefix
plotMDS_options.format = options.format
plotMDS_options.title = options.title
plotMDS_options.xlabel = options.xlabel
plotMDS_options.ylabel = options.ylabel
plotMDS_options.population_file = populationFileName
try:
# Checking the options
PlotMDS.checkArgs(plotMDS_options)
# Reading the population file
populations = PlotMDS.readPopulations(plotMDS_options.population_file)
# Getting the data
data, labels = PlotMDS.extractData(plotMDS_options.file, populations)
order = [labels.index("CEU"), labels.index("YRI"),
labels.index("JPT-CHB"), labels.index("SOURCE")]
if "HIGHLIGHT" in labels:
order.append(labels.index("HIGHLIGHT"))
color = [(0.215686275, 0.000494118, 0.721568627),
(0.301960784, 0.68627451, 0.290196078),
(0.596078431, 0.305882353, 0.639215686),
(0.894117647, 0.101960784, 0.109803922),
(0.596078431, 0.305882353, 0.639215686)]
sizes = [12, 12, 12, 8, 12]
markers = [".", ".", ".", "+", "D"]
# Plotting the data
PlotMDS.plotMDS(data, order, labels, color, sizes, markers,
plotMDS_options)
except PlotMDS.ProgramError as e:
msg = "PlotMDS: %s" % e
raise ProgramError(msg)
[docs]def createMDSFile(nb_components, inPrefix, outPrefix, genomeFileName):
"""Creates a MDS file using Plink.
:param nb_components: the number of component.
:param inPrefix: the prefix of the input file.
:param outPrefix: the prefix of the output file.
:param genomeFileName: the name of the ``genome`` file.
:type nb_components: int
:type inPrefix: str
:type outPrefix: str
:type genomeFileName: str
Using Plink, computes the MDS values for each individual using the
``inPrefix``, ``genomeFileName`` and the number of components. The results
are save using the ``outPrefix`` prefix.
"""
plinkCommand = ["plink", "--noweb", "--bfile", inPrefix, "--read-genome",
genomeFileName, "--cluster", "--mds-plot",
str(nb_components), "--out", outPrefix]
runCommand(plinkCommand)
[docs]def allFileExists(fileList):
"""Check that all file exists.
:param fileList: the list of file to check.
:type fileList: list
Check if all the files in ``fileList`` exists.
"""
allExists = True
for fileName in fileList:
allExists = allExists and os.path.isfile(fileName)
return allExists
[docs]def flipSNPs(inPrefix, outPrefix, flipFileName):
"""Flip SNPs using Plink.
:param inPrefix: the prefix of the input file.
:param outPrefix: the prefix of the output file.
:param flipFileName: the name of the file containing the markers to flip.
:type inPrefix: str
:type outPrefix: str
:type flipFileName: str
Using Plink, flip a set of markers in ``inPrefix``, and saves the results
in ``outPrefix``. The list of markers to be flipped is in ``flipFileName``.
"""
plinkCommand = ["plink", "--noweb", "--bfile", inPrefix, "--flip",
flipFileName, "--make-bed", "--out", outPrefix]
runCommand(plinkCommand)
[docs]def excludeSNPs(inPrefix, outPrefix, exclusionFileName):
"""Exclude some SNPs using Plink.
:param inPrefix: the prefix of the input file.
:param outPrefix: the prefix of the output file.
:param exclusionFileName: the name of the file containing the markers to be
excluded.
:type inPrefix: str
:type outPrefix: str
:type exclusionFileName: str
Using Plink, exclude a list of markers from ``inPrefix``, and saves the
results in ``outPrefix``. The list of markers are in ``exclusionFileName``.
"""
plinkCommand = ["plink", "--noweb", "--bfile", inPrefix, "--exclude",
exclusionFileName, "--make-bed", "--out", outPrefix]
runCommand(plinkCommand)
[docs]def renameSNPs(inPrefix, updateFileName, outPrefix):
"""Updates the name of the SNPs using Plink.
:param inPrefix: the prefix of the input file.
:param updateFileName: the name of the file containing the updated marker
names.
:param outPrefix: the prefix of the output file.
:type inPrefix: str
:type updateFileName: str
:type outPrefix: str
Using Plink, changes the name of the markers in ``inPrefix`` using
``updateFileName``. It saves the results in ``outPrefix``.
"""
plinkCommand = ["plink", "--noweb", "--bfile", inPrefix, "--update-map",
updateFileName, "--update-name", "--make-bed", "--out",
outPrefix]
runCommand(plinkCommand)
[docs]def findFlippedSNPs(frqFile1, frqFile2, outPrefix):
"""Find flipped SNPs and flip them in the data.
:param frqFile1: the name of the first frequency file.
:param frqFile2: the name of the second frequency file.
:param outPrefix: the prefix of the output files.
:type frqFile1: str
:type frqFile2: str
:type outPrefix: str
By reading two frequency files (``frqFile1`` and ``frqFile2``), it finds a
list of markers that need to be flipped so that the first file becomes
comparable with the second one. Also finds marker that need to be removed.
A marker needs to be flipped in one of the two data set if the two markers
are not comparable (same minor allele), but become comparable if we flip
one of them.
A marker will be removed if it is all homozygous in at least one data set.
It will also be removed if it's impossible to determine the phase of the
marker (*e.g.* if the two alleles are ``A`` and ``T`` or ``C`` and ``G``).
"""
allelesFiles = [{}, {}]
files = [frqFile1, frqFile2]
for k, fileName in enumerate(files):
try:
with open(fileName, "r") as inputFile:
headerIndex = None
for i, line in enumerate(inputFile):
row = createRowFromPlinkSpacedOutput(line)
if i == 0:
# This is the header
headerIndex = dict([
(row[j], j) for j in xrange(len(row))
])
# Checking the columns
for columnName in ["SNP", "A1", "A2"]:
if columnName not in headerIndex:
msg = "%(fileName)s: no column named " \
"%(columnName)s" % locals()
raise ProgramError(msg)
else:
snpName = row[headerIndex["SNP"]]
allele1 = row[headerIndex["A1"]]
allele2 = row[headerIndex["A2"]]
alleles = set([allele1, allele2])
if "0" in alleles:
alleles.remove("0")
allelesFiles[k][snpName] = alleles
except IOError:
msg = "%(fileName)s: no such file" % locals()
raise ProgramError(msg)
allelesFile1, allelesFile2 = allelesFiles
# Finding the SNPs to flip
toFlipOutputFile = None
try:
toFlipOutputFile = open(outPrefix + ".snp_to_flip_in_reference", "w")
except IOError:
msg = "%(outPrefix)s.snp_to_flip_in_reference: can't write " \
"file" % locals()
raise ProgramError(msg)
toRemoveOutputFile = None
try:
toRemoveOutputFile = open(outPrefix + ".snp_to_remove", "w")
except IOError:
msg = "%(outPrefix)s.snp_to_remove: can't write file" % locals()
raise ProgramError(msg)
for snpName in allelesFile1.iterkeys():
alleles1 = allelesFile1[snpName]
alleles2 = allelesFile2[snpName]
if (len(alleles1) == 2) and (len(alleles2) == 2):
# Both are heterozygous
if ({"A", "T"} == alleles1) or ({"C", "G"} == alleles1) or \
({"A", "T"} == alleles2) or ({"C", "G"} == alleles2):
# We can't flip those..., so we remove them
print >>toRemoveOutputFile, snpName
else:
if alleles1 != alleles2:
# Let's try the flip one
if flipGenotype(alleles1) == alleles2:
# We need to flip it
print >>toFlipOutputFile, snpName
else:
# Those SNP are discordant...
print >>toRemoveOutputFile, snpName
else:
# We want to remove this SNP, because there is at least one
# homozygous individual
print >>toRemoveOutputFile, snpName
# Closing output files
toFlipOutputFile.close()
toRemoveOutputFile.close()
[docs]def computeFrequency(prefix, outPrefix):
"""Compute the frequency using Plink.
:param prefix: the prefix of the file binary file for which we need to
compute frequencies.
:param outPrefix: the prefix of the output files.
:type prefix: str
:type outPrefix: str
Uses Plink to compute the frequency of all the markers in the ``prefix``
binary file.
"""
plinkCommand = ["plink", "--noweb", "--bfile", prefix, "--freq", "--out",
outPrefix]
runCommand(plinkCommand)
[docs]def combinePlinkBinaryFiles(prefixes, outPrefix):
"""Combine Plink binary files.
:param prefixes: a list of the prefix of the files that need to be
combined.
:param outPrefix: the prefix of the output file (the combined file).
:type prefixes: list
:type outPrefix: str
It uses Plink to merge a list of binary files (which is a list of prefixes
(strings)), and create the final data set which as ``outPrefix`` as the
prefix.
"""
# The first file is the bfile, the others are the ones to merge
outputFile = None
try:
outputFile = open(outPrefix + ".files_to_merge", "w")
except IOError:
msg = "%(outPrefix)s.filesToMerge: can't write file" % locals()
raise ProgramError(msg)
for prefix in prefixes[1:]:
print >>outputFile, " ".join([
prefix + i for i in [".bed", ".bim", ".fam"]
])
# Closing the output files
outputFile.close()
# Runing plink
plinkCommand = ["plink", "--noweb", "--bfile", prefixes[0],
"--merge-list", outPrefix + ".files_to_merge",
"--make-bed", "--out", outPrefix]
runCommand(plinkCommand)
[docs]def findOverlappingSNPsWithReference(prefix, referencePrefixes,
referencePopulations, outPrefix):
"""Find the overlapping SNPs in 4 different data sets.
:param prefix: the prefix of all the files.
:param referencePrefixes: the prefix of the reference population files.
:param referencePopulations: the name of the reference population (same
order as ``referencePrefixes``)
:param outPrefix: the prefix of the output files.
:type prefix: str
:type referencePrefixes: list
:type referencePopulations: list
:type outPrefix: str
It starts by reading the ``bim`` file of the source data set
(``prefix.bim``). It finds all the markers (excluding the duplicated ones).
Then it reads all of the reference population ``bim`` files
(``referencePrefixes.bim``) and find all the markers that were found in the
source data set.
It creates three output files:
* ``outPrefix.ref_snp_to_extract``: the name of the markers that needs to
be extracted from the three reference panels.
* ``outPrefix.source_snp_to_extract``: the name of the markers that needs
to be extracted from the source panel.
* ``outPrefix.update_names``: a file (readable by Plink) that will help in
changing the names of the selected markers in the reference panels, so
that they become comparable with the source panel.
"""
# Reading the main file
sourceSnpToExtract = {}
duplicates = set()
try:
with open(prefix + ".bim", "r") as inputFile:
for line in inputFile:
row = line.rstrip("\r\n").split("\t")
chromosome = row[0]
position = row[3]
snpName = row[1]
if (chromosome, position) not in sourceSnpToExtract:
sourceSnpToExtract[(chromosome, position)] = snpName
else:
# It's a duplicate
duplicates.add((chromosome, position))
except IOError:
msg = "%s.bim: no such file" % prefix
raise ProgramError(msg)
# Removing duplicates from the list
for snpID in duplicates:
del sourceSnpToExtract[snpID]
# Reading each of the reference markers
refSnpToExtract = defaultdict(dict)
refSnp = defaultdict(set)
for refPop, refPrefix in izip(referencePopulations, referencePrefixes):
try:
with open(refPrefix + ".bim", "r") as inputFile:
for line in inputFile:
row = line.rstrip("\r\n").split("\t")
chromosome = row[0]
position = row[3]
snpName = row[1]
key = (chromosome, position)
if key in sourceSnpToExtract:
# We want this SNP
refSnpToExtract[refPop][key] = snpName
refSnp[refPop].add(key)
logger.info(" - {:,d} overlaps with {}".format(
len(refSnp[refPop]),
refPrefix,
))
except IOError:
msg = "%(refPrefix)s.bim: no such file" % locals()
raise ProgramError(msg)
# Creating the intersect of the reference SNP
refSnpIntersect = refSnp[referencePopulations[0]]
for refPop in referencePopulations[1:]:
refSnpIntersect &= refSnp[refPop]
logger.info(" - {:,d} in common between reference "
"panels".format(len(refSnpIntersect)))
# Printing the names of the SNPs to extract in the reference
refOutputFiles = {}
try:
for refPop in referencePopulations:
refOutputFiles[refPop] = open(
outPrefix + ".{}_snp_to_extract".format(refPop),
mode="w",
)
except IOError:
msg = "{}.POP_snp_to_extract: can't write file".format(outPrefix)
raise ProgramError(msg)
sourceOutputFile = None
try:
sourceOutputFile = open(outPrefix + ".source_snp_to_extract", "w")
except IOError:
msg = "{}.source_snp_to_extract: can't write file".format(outPrefix)
raise ProgramError(msg)
changeNameOutputFiles = {}
try:
for refPop in referencePopulations:
changeNameOutputFiles[refPop] = open(
outPrefix + ".{}_update_names".format(refPop),
mode="w",
)
except IOError:
msg = "%(outPrefix)s.updateNames: can't write file" % locals()
raise ProgramError(msg)
# Writing the file containing the SNPs to extract in the source
for snpID in refSnpIntersect:
print >>sourceOutputFile, sourceSnpToExtract[snpID]
for refPop in referencePopulations:
print >>refOutputFiles[refPop], refSnpToExtract[refPop][snpID]
print >>changeNameOutputFiles[refPop], "\t".join([
refSnpToExtract[refPop][snpID],
sourceSnpToExtract[snpID],
])
# Closing the output file
sourceOutputFile.close()
for refOutputFile in refOutputFiles.itervalues():
refOutputFile.close()
for changeNameOutputFile in changeNameOutputFiles.itervalues():
changeNameOutputFile.close()
[docs]def runCommand(command):
"""Run a command.
:param command: the command to run.
:type command: list
Tries to run a command. If it fails, raise a :py:class:`ProgramError`. This
function uses the :py:mod:`subprocess` module.
.. warning::
The variable ``command`` should be a list of strings (no other type).
"""
output = None
try:
output = subprocess.check_output(
command,
stderr=subprocess.STDOUT,
shell=False,
)
except subprocess.CalledProcessError:
msg = "couldn't run command\n" + " ".join(command)
raise ProgramError(msg)
[docs]def checkArgs(args):
"""Checks the arguments and options.
:param args: an object containing the options of the program.
:type args: argparse.Namespace
:returns: ``True`` if everything was OK.
If there is a problem with an option, an exception is raised using the
:py:class:`ProgramError` class, a message is printed to the
:py:class:`sys.stderr` and the program exists with code 1.
"""
# Check if we have the tped and the tfam files
prefixes_to_check = [args.bfile]
if not args.skip_ref_pops:
for pop in ("ceu", "yri", "jpt_chb"):
if vars(args)["{}_bfile".format(pop)] is None:
raise ProgramError("argument --{}-bfile is "
"required".format(pop.replace("_", "-")))
prefixes_to_check += [
args.ceu_bfile,
args.yri_bfile,
args.jpt_chb_bfile,
]
for prefix in prefixes_to_check:
if prefix is None:
msg = "no input file"
raise ProgramError(msg)
for fileName in [prefix + i for i in [".bed", ".bim", ".fam"]]:
if not os.path.isfile(fileName):
msg = "%(fileName)s: no such file" % locals()
raise ProgramError(msg)
# Check the indep-pairwise option
# The two first must be int, the last one float
try:
for i in xrange(2):
tmp = int(args.indep_pairwise[i])
tmp = float(args.indep_pairwise[2])
except ValueError:
msg = "indep-pairwise: need INT INT FLOAT"
raise ProgramError(msg)
# Check the maf value
tmpMAF = None
try:
tmpMAF = float(args.maf)
except ValueError:
msg = "maf: must be a float, not %s" % args.maf
raise ProgramError(msg)
if (tmpMAF > 0.5) or (tmpMAF < 0.0):
msg = "maf: must be between 0.0 and 0.5, not %s" % args.maf
raise ProgramError(msg)
# Check the number of line per file
if args.line_per_file_for_sge < 1:
msg = "line-per-file-for-sge: must be above 0, not " \
"%d" % args.line_per_file_for_sge
raise ProgramError(msg)
# Check the number of component to compute
if args.nb_components < 2 or args.nb_components > 10:
msg = ("nb-components: must be between 2 and 10 (inclusive), "
"not {}".format(args.nb_components))
raise ProgramError(msg)
# Check the minimum number of SNPs
if args.min_nb_snp < 1:
msg = "min-nb-snp: must be above 1"
raise ProgramError(msg)
return True
[docs]def parseArgs(argString=None): # pragma: no cover
"""Parses the command line options and arguments.
:param argString: the options.
:type argString: list
:returns: A :py:class:`argparse.Namespace` object created by
the :py:mod:`argparse` module. It contains the values of the
different options.
=========================== ====== ========================================
Options Type Description
=========================== ====== ========================================
``--bfile`` string The input file prefix (Plink binary
file).
``--skip-ref-pops`` bool Perform the MDS computation, but skip
the three reference panels.
``--ceu-bfile`` string The input file prefix for the CEU
population (Plink binary file).
``--yri-bfile`` string The input file prefix for the YRI
population (Plink binary file).
``--jpt-chb-bfile`` string The input file prefix for the JPT-CHB
population (Plink binary file).
``--min-nb-snp`` int The minimum number of markers needed to
compute IBS.
``--indep-pairwise`` string Three numbers: window size, window shift
and the r2 threshold.
``--maf`` string Restrict to SNPs with MAF >= threshold.
``--sge`` bool Use SGE for parallelization.
``--sge-walltime`` int The time limit (for clusters).
``--sge-nodes`` int Two INTs (number of nodes and number of
int processor per nodes).
``--ibs-sge-walltime`` int The time limit (for clusters) (for IBS)
``--ibs-sge-nodes`` int Two INTs (number of nodes and number of
int processor per nodes) (for IBS).
``--line-per-file-for-sge`` int The number of line per file for SGE task
array.
``--nb-components`` int The number of component to compute.
``--outliers-of`` string Finds the ouliers of this population.
``--multiplier`` float To find the outliers, we look for more
than x times the cluster standard
deviation.
``--xaxis`` string The component to use for the X axis.
``--yaxis`` string The component to use for the Y axis.
``--format`` string The output file format.
``--title`` string The title of the MDS plot.
``--xlabel`` string The label of the X axis.
``--ylabel`` string The label of the Y axis.
``--create-scree-plot`` bool Computes Eigenvalues and creates a scree
plot.
``--scree-plot-title`` string The main title of the scree plot
``--out`` string The prefix of the output files.
=========================== ====== ========================================
.. note::
No option check is done here (except for the one automatically done by
argparse). Those need to be done elsewhere (see :py:func:`checkArgs`).
"""
args = None
if argString is None:
args = parser.parse_args()
else:
args = parser.parse_args(argString)
return args
[docs]class ProgramError(Exception):
"""An :py:class:`Exception` raised in case of a problem.
:param msg: the message to print to the user before exiting.
:type msg: str
"""
def __init__(self, msg):
"""Construction of the :py:class:`ProgramError` class.
:param msg: the message to print to the user
:type msg: str
"""
self.message = str(msg)
def __str__(self):
return self.message
# The parser object
pretty_name = "Ethnicity"
desc = "Checks sample's ethnicity using reference populations and IBS."
long_desc = ("The script uses pairwise IBS matrix as a distance metric to "
"identify cryptic relatedness among samples and sample outliers "
"by multi-dimensional scaling (MDS).")
parser = argparse.ArgumentParser(description=desc)
parser.add_argument("-v", "--version", action="version",
version="pyGenClean version {}".format(__version__))
# The INPUT files
group = parser.add_argument_group("Input File")
group.add_argument("--bfile", type=str, metavar="FILE", required=True,
help=("The input file prefix (will find the plink binary "
"files by appending the prefix to the .bim, .bed and "
".fam files, respectively."))
group.add_argument("--skip-ref-pops", action="store_true",
help=("Perform the MDS computation, but skip the three "
"reference panels."))
group.add_argument("--ceu-bfile", type=str, metavar="FILE",
help=("The input file prefix (will find the plink binary "
"files by appending the prefix to the .bim, .bed and "
".fam files, respectively.) for the CEU population"))
group.add_argument("--yri-bfile", type=str, metavar="FILE",
help=("The input file prefix (will find the plink binary "
"files by appending the prefix to the .bim, .bed and "
".fam files, respectively.) for the YRI population"))
group.add_argument("--jpt-chb-bfile", type=str, metavar="FILE",
help=("The input file prefix (will find the plink binary "
"files by appending the prefix to the .bim, .bed and "
".fam files, respectively.) for the JPT-CHB "
"population"))
# The options
group = parser.add_argument_group("Options")
group.add_argument("--min-nb-snp", type=int, metavar="INT", default=8000,
help=("The minimum number of markers needed to compute IBS "
"values. [Default: %(default)d]"))
group.add_argument("--indep-pairwise", type=str, metavar="STR",
nargs=3, default=["50", "5", "0.1"],
help=("Three numbers: window size, window shift and "
"the r2 threshold. [default: %(default)s]"))
group.add_argument("--maf", type=str, metavar="FLOAT", default="0.05",
help=("Restrict to SNPs with MAF >= threshold. "
"[default: %(default)s]"))
group.add_argument("--sge", action="store_true",
help="Use SGE for parallelization.")
group.add_argument("--sge-walltime", type=str, metavar="TIME",
help=("The walltime for the job to run on the cluster. Do "
"not use if you are not required to specify a "
"walltime for your jobs on your cluster (e.g. 'qsub "
"-lwalltime=1:0:0' on the cluster)."))
group.add_argument("--sge-nodes", type=int, metavar="INT", nargs=2,
help=("The number of nodes and the number of processor per "
"nodes to use (e.g. 'qsub -lnodes=X:ppn=Y' on the "
"cluster, where X is the number of nodes and Y is "
" the number of processor to use. Do not use if you "
"are not required to specify the number of nodes for "
"your jobs on the cluster."))
group.add_argument("--ibs-sge-walltime", type=str, metavar="TIME",
help=("The walltime for the IBS jobs to run on the "
"cluster. Do not use if you are not required to "
"specify a walltime for your jobs on your cluster "
"(e.g. 'qsub -lwalltime=1:0:0' on the cluster)."))
group.add_argument("--ibs-sge-nodes", type=int, metavar="INT", nargs=2,
help=("The number of nodes and the number of processor per "
"nodes to use for the IBS jobs (e.g. 'qsub "
"-lnodes=X:ppn=Y' on the cluster, where X is the "
"number of nodes and Y is the number of processor to "
"use. Do not use if you are not required to specify "
"the number of nodes for your jobs on the cluster."))
group.add_argument("--line-per-file-for-sge", type=int, metavar="INT",
default=100, help=("The number of line per file for SGE "
"task array for the IBS jobs. [default: "
"%(default)d]"))
group.add_argument("--nb-components", type=int, metavar="INT", default=10,
help=("The number of component to compute. [default: "
"%(default)d]"))
# The outlier options
group = parser.add_argument_group("Outlier Options")
find_outliers.add_custom_options(group)
# The MDS plotting options
group = parser.add_argument_group("MDS Plot Options")
PlotMDS.addCustomOptions(group)
# The Scree Plot options
group = parser.add_argument_group("Scree Plot Options")
group.add_argument("--create-scree-plot", action="store_true",
help="Computes Eigenvalues and creates a scree plot.")
PlotEigenvalues.add_custom_options(group)
# The OUTPUT files
group = parser.add_argument_group("Output File")
group.add_argument("--out", type=str, metavar="FILE", default="ethnicity",
help=("The prefix of the output files. [default: "
"%(default)s]"))
[docs]def safe_main():
"""A safe version of the main function (that catches ProgramError)."""
try:
main()
except KeyboardInterrupt:
logger.info("Cancelled by user")
sys.exit(0)
except ProgramError as e:
logger.error(e.message)
parser.error(e.message)
if __name__ == "__main__":
safe_main()