Source code for pyGenClean.Ethnicity.check_ethnicity

#!/usr/bin/env python2.7

# This file is part of pyGenClean.
#
# pyGenClean is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation, either version 3 of the License, or (at your option) any later
# version.
#
# pyGenClean is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
# A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along with
# pyGenClean.  If not, see <http://www.gnu.org/licenses/>.


import os
import sys
import logging
import argparse
import subprocess
from itertools import izip
from collections import defaultdict

from ..PlinkUtils import plot_MDS as PlotMDS
from ..PlinkUtils import createRowFromPlinkSpacedOutput

from .. import __version__
from . import find_outliers
from . import plot_eigenvalues as PlotEigenvalues
from ..DupSNPs.duplicated_snps import flipGenotype
from ..RelatedSamples import find_related_samples as Relatedness


logger = logging.getLogger("check_ethnicity")


[docs]class Dummy(object): pass
[docs]def main(argString=None): """The main function. :param argString: the options. :type argString: list These are the steps of this module: 1. Prints the options. 2. Finds the overlapping markers between the three reference panels and the source panel (:py:func:`findOverlappingSNPsWithReference`). 3. Extract the required markers from all the data sets (:py:func:`extractSNPs`). 4. Renames the reference panel's marker names to that they are the same as the source panel (for all populations) (:py:func:`renameSNPs`). 5. Combines the three reference panels together (:py:func:`combinePlinkBinaryFiles`). 6. Compute the frequency of all the markers from the reference and the source panels (:py:func:`computeFrequency`). 7. Finds the markers to flip in the reference panel (when compared to the source panel) (:py:func:`findFlippedSNPs`). 8. Excludes the unflippable markers from the reference and the source panels (:py:func:`excludeSNPs`). 9. Flips the markers that need flipping in their reference panel (:py:func:`flipSNPs`). 10. Combines the reference and the source panels (:py:func:`combinePlinkBinaryFiles`). 11. Runs part of :py:mod:`pyGenClean.RelatedSamples.find_related_samples` on the combined data set (:py:func:`runRelatedness`). 12. Creates the ``mds`` file from the combined data set and the result of previous step (:py:func:`createMDSFile`). 13. Creates the population file (:py:func:`createPopulationFile`). 14. Plots the ``mds`` values (:py:func:`plotMDS`). 15. Finds the outliers of a given reference population (:py:func:`find_the_outliers`). 16. If required, computes the Eigenvalues using smartpca (:py:func:`compute_eigenvalues`). 17. If required, creates a scree plot from smartpca resutls (:py:func:`create_scree_plot`). """ # Getting and checking the options args = parseArgs(argString) checkArgs(args) logger.info("Options used:") for key, value in vars(args).iteritems(): logger.info(" --{} {}".format(key.replace("_", "-"), value)) newBfile = None popNames = ["CEU", "YRI", "JPT-CHB"] referencePrefixes = [args.ceu_bfile, args.yri_bfile, args.jpt_chb_bfile] if not args.skip_ref_pops: # Find overlap with the reference file logger.info("Finding overlapping SNPs between reference and " "source panels") findOverlappingSNPsWithReference( prefix=args.bfile, referencePrefixes=referencePrefixes, referencePopulations=popNames, outPrefix=args.out, ) # Extract the required SNPs using Plink (reference panels) logger.info("Extracting overlapping SNPs from the reference panels") extractSNPs( snpToExtractFileNames=[ args.out + ".{}_snp_to_extract".format(popName) for popName in popNames ], referencePrefixes=referencePrefixes, popNames=popNames, outPrefix=args.out + ".reference_panel", runSGE=args.sge, options=args, ) # Extract the required SNPs using Plink (source panel) logger.info("Extracting overlapping SNPs from the source panel") extractSNPs( snpToExtractFileNames=[args.out + ".source_snp_to_extract"], referencePrefixes=[args.bfile], popNames=["ALL"], outPrefix=args.out + ".source_panel", runSGE=False, options=args, ) # Renaming the reference file, so that the SNP names are the same for pop in popNames: logger.info("Renaming reference panel's SNPs {} to match source " "panel".format(pop)) renameSNPs( inPrefix=args.out + ".reference_panel.{}".format(pop), updateFileName=args.out + ".{}_update_names".format(pop), outPrefix=args.out + ".reference_panel.{}.rename".format(pop), ) # Combining the reference panel logger.info("Combining the reference panels") combinePlinkBinaryFiles( prefixes=[ args.out + ".reference_panel.{}.rename".format(pop) for pop in popNames ], outPrefix=args.out + ".reference_panel.ALL.rename", ) # Computing the frequency (reference panel) logger.info("Computing reference panel frequencies") computeFrequency( prefix=args.out + ".reference_panel.ALL.rename", outPrefix=args.out + ".reference_panel.ALL.rename.frequency", ) # Computing the frequency (source panel) logger.info("Computing source panel frequencies") computeFrequency( prefix=args.out + ".source_panel.ALL", outPrefix=args.out + ".source_panel.ALL.frequency", ) # Finding the SNPs to flip and flip them in the reference panel logger.info("Finding SNPs to flip or to exclude from reference panel") findFlippedSNPs( frqFile1=args.out + ".reference_panel.ALL.rename.frequency.frq", frqFile2=args.out + ".source_panel.ALL.frequency.frq", outPrefix=args.out, ) # Excluding SNPs (reference panel) logger.info("Excluding SNPs from reference panel") excludeSNPs( inPrefix=args.out + ".reference_panel.ALL.rename", outPrefix=args.out + ".reference_panel.ALL.rename.cleaned", exclusionFileName=args.out + ".snp_to_remove", ) # Excluding SNPs (source panel) logger.info("Excluding SNPs from source panel") excludeSNPs(args.out + ".source_panel.ALL", args.out + ".source_panel.ALL.cleaned", args.out + ".snp_to_remove") # Flipping the SNP that need to be flip in the reference logger.info("Flipping SNPs in reference panel") flipSNPs( inPrefix=args.out + ".reference_panel.ALL.rename.cleaned", outPrefix=args.out + ".reference_panel.ALL.rename.cleaned.flipped", flipFileName=args.out + ".snp_to_flip_in_reference", ) # Combining the reference panel logger.info("Combining reference and source panels") combinePlinkBinaryFiles( prefixes=[args.out + ".reference_panel.ALL.rename.cleaned.flipped", args.out + ".source_panel.ALL.cleaned"], outPrefix=args.out + ".final_dataset_for_genome", ) # Runing the relatedness step logger.info("Creating the genome file using Plink") newBfile = runRelatedness( inputPrefix=args.out + ".final_dataset_for_genome", outPrefix=args.out, options=args, ) else: # Just run relatedness on the dataset newBfile = runRelatedness( inputPrefix=args.bfile, outPrefix=args.out, options=args, ) # Creating the MDS file logger.info("Creating the MDS file using Plink") createMDSFile( nb_components=args.nb_components, inPrefix=newBfile, outPrefix=args.out + ".mds", genomeFileName=args.out + ".ibs.genome.genome", ) if not args.skip_ref_pops: # Creating the population files logger.info("Creating a population file") famFiles = [ args.out + ".reference_panel." + i + ".fam" for i in popNames ] famFiles.append(args.out + ".source_panel.ALL.fam") labels = popNames + ["SOURCE"] createPopulationFile( inputFiles=famFiles, labels=labels, outputFileName=args.out + ".population_file", ) # Plot the MDS value logger.info("Creating the MDS plot") plotMDS( inputFileName=args.out + ".mds.mds", outPrefix=args.out + ".mds", populationFileName=args.out + ".population_file", options=args, ) # Finding the outliers logger.info("Finding the outliers") find_the_outliers( mds_file_name=args.out + ".mds.mds", population_file_name=args.out + ".population_file", ref_pop_name=args.outliers_of, multiplier=args.multiplier, out_prefix=args.out, ) # De we need to create a scree plot? if args.create_scree_plot: # Computing the eigenvalues using smartpca logger.info("Computing eigenvalues") compute_eigenvalues( in_prefix=args.out + ".ibs.pruned_data", out_prefix=args.out + ".smartpca", ) logger.info("Creating scree plot") create_scree_plot( in_filename=args.out + ".smartpca.evec.txt", out_filename=args.out + ".smartpca.scree_plot.png", plot_title=args.scree_plot_title, )
[docs]def create_scree_plot(in_filename, out_filename, plot_title): """Creates a scree plot using smartpca results. :param in_filename: the name of the input file. :param out_filename: the name of the output file. :param plot_title: the title of the scree plot. :type in_filename: str :type out_filename: str :type plot_title: str """ # Constructing the arguments scree_plot_args = ("--evec", in_filename, "--out", out_filename, "--scree-plot-title", plot_title) try: # Executing the main method PlotEigenvalues.main(argString=scree_plot_args) except PlotEigenvalues.ProgramError as e: msg = "PlotEigenvalues: {}".format(e) raise ProgramError(msg)
[docs]def compute_eigenvalues(in_prefix, out_prefix): """Computes the Eigenvalues using smartpca from Eigensoft. :param in_prefix: the prefix of the input files. :param out_prefix: the prefix of the output files. :type in_prefix: str :type out_prefix: str Creates a "parameter file" used by smartpca and runs it. """ # First, we create the parameter file with open(out_prefix + ".parameters", "w") as o_file: print >>o_file, "genotypename: " + in_prefix + ".bed" print >>o_file, "snpname: " + in_prefix + ".bim" print >>o_file, "indivname: " + in_prefix + ".fam" print >>o_file, "evecoutname: " + out_prefix + ".evec.txt" print >>o_file, "evaloutname: " + out_prefix + ".eval.txt" print >>o_file, "numoutlieriter: 0" print >>o_file, "altnormstyle: NO" # Executing smartpca command = ["smartpca", "-p", out_prefix + ".parameters"] runCommand(command)
[docs]def find_the_outliers(mds_file_name, population_file_name, ref_pop_name, multiplier, out_prefix): """Finds the outliers of a given population. :param mds_file_name: the name of the ``mds`` file. :param population_file_name: the name of the population file. :param ref_pop_name: the name of the reference population for which to find outliers from. :param multiplier: the multiplier of the cluster standard deviation to modify the strictness of the outlier removal procedure. :param out_prefix: the prefix of the output file. :type mds_file_name: str :type population_file_name: str :type ref_pop_name: str :type multiplier: float :type out_prefix: str Uses the :py:mod:`pyGenClean.Ethnicity.find_outliers` modules to find outliers. It requires the ``mds`` file created by :py:func:`createMDSFile` and the population file created by :py:func:`createPopulationFile`. """ options = ["--mds", mds_file_name, "--population-file", population_file_name, "--outliers-of", ref_pop_name, "--multiplier", str(multiplier), "--out", out_prefix] try: find_outliers.main(options) except find_outliers.ProgramError as e: msg = "find_outliers: {}".format(e) raise ProgramError(msg)
[docs]def createPopulationFile(inputFiles, labels, outputFileName): """Creates a population file. :param inputFiles: the list of input files. :param labels: the list of labels (corresponding to the input files). :param outputFileName: the name of the output file. :type inputFiles: list :type labels: list :type outputFileName: str The ``inputFiles`` is in reality a list of ``tfam`` files composed of samples. For each of those ``tfam`` files, there is a label associated with it (representing the name of the population). The output file consists of one row per sample, with the following three columns: the family ID, the individual ID and the population of each sample. """ outputFile = None try: outputFile = open(outputFileName, 'w') except IOError: msg = "%(outputFileName)s: can't write file" raise ProgramError(msg) for i in xrange(len(inputFiles)): # For each file fileName = inputFiles[i] label = labels[i] try: with open(fileName, 'r') as inputFile: for line in inputFile: row = line.rstrip("\r\n").split(" ") # Getting the informations famID = row[0] indID = row[1] # Printing to file print >>outputFile, "\t".join([famID, indID, label]) except IOError: msg = "%(fileName)s: no such file" % locals() raise ProgramError(msg) # Closing the output file outputFile.close()
[docs]def plotMDS(inputFileName, outPrefix, populationFileName, options): """Plots the MDS value. :param inputFileName: the name of the ``mds`` file. :param outPrefix: the prefix of the output files. :param populationFileName: the name of the population file. :param options: the options :type inputFileName: str :type outPrefix: str :type populationFileName: str :type options: argparse.Namespace Plots the ``mds`` value according to the ``inputFileName`` file (``mds``) and the ``populationFileName`` (the population file). """ # The options plotMDS_options = Dummy() plotMDS_options.file = inputFileName plotMDS_options.out = outPrefix plotMDS_options.format = options.format plotMDS_options.title = options.title plotMDS_options.xlabel = options.xlabel plotMDS_options.ylabel = options.ylabel plotMDS_options.population_file = populationFileName try: # Checking the options PlotMDS.checkArgs(plotMDS_options) # Reading the population file populations = PlotMDS.readPopulations(plotMDS_options.population_file) # Getting the data data, labels = PlotMDS.extractData(plotMDS_options.file, populations) order = [labels.index("CEU"), labels.index("YRI"), labels.index("JPT-CHB"), labels.index("SOURCE")] if "HIGHLIGHT" in labels: order.append(labels.index("HIGHLIGHT")) color = [(0.215686275, 0.000494118, 0.721568627), (0.301960784, 0.68627451, 0.290196078), (0.596078431, 0.305882353, 0.639215686), (0.894117647, 0.101960784, 0.109803922), (0.596078431, 0.305882353, 0.639215686)] sizes = [12, 12, 12, 8, 12] markers = [".", ".", ".", "+", "D"] # Plotting the data PlotMDS.plotMDS(data, order, labels, color, sizes, markers, plotMDS_options) except PlotMDS.ProgramError as e: msg = "PlotMDS: %s" % e raise ProgramError(msg)
[docs]def createMDSFile(nb_components, inPrefix, outPrefix, genomeFileName): """Creates a MDS file using Plink. :param nb_components: the number of component. :param inPrefix: the prefix of the input file. :param outPrefix: the prefix of the output file. :param genomeFileName: the name of the ``genome`` file. :type nb_components: int :type inPrefix: str :type outPrefix: str :type genomeFileName: str Using Plink, computes the MDS values for each individual using the ``inPrefix``, ``genomeFileName`` and the number of components. The results are save using the ``outPrefix`` prefix. """ plinkCommand = ["plink", "--noweb", "--bfile", inPrefix, "--read-genome", genomeFileName, "--cluster", "--mds-plot", str(nb_components), "--out", outPrefix] runCommand(plinkCommand)
[docs]def runRelatedness(inputPrefix, outPrefix, options): """Run the relatedness step of the data clean up. :param inputPrefix: the prefix of the input file. :param outPrefix: the prefix of the output file. :param options: the options :type inputPrefix: str :type outPrefix: str :type options: argparse.Namespace :returns: the prefix of the new bfile. Runs :py:mod:`pyGenClean.RelatedSamples.find_related_samples` using the ``inputPrefix`` files and ``options`` options, and saves the results using the ``outPrefix`` prefix. """ # The options new_options = ["--bfile", inputPrefix, "--genome-only", "--min-nb-snp", str(options.min_nb_snp), "--maf", options.maf, "--out", "{}.ibs".format(outPrefix)] new_options += ["--indep-pairwise"] + options.indep_pairwise if options.sge: new_options.append("--sge") new_options += ["--line-per-file-for-sge", str(options.line_per_file_for_sge)] if options.ibs_sge_walltime is not None: new_options += ["--sge-walltime", options.ibs_sge_walltime] if options.ibs_sge_nodes is not None: new_options += ["--sge-nodes"] + map(str, options.ibs_sge_nodes) # Checking the input file if not allFileExists([inputPrefix + i for i in [".bed", ".bim", ".fam"]]): msg = "{}: not a valid binary prefix".format(inputPrefix) raise ProgramError(msg) newBfile = None try: newBfile = Relatedness.main(new_options) except Relatedness.ProgramError as e: msg = "compute genome: {}".format(e) raise ProgramError(msg) return newBfile
[docs]def allFileExists(fileList): """Check that all file exists. :param fileList: the list of file to check. :type fileList: list Check if all the files in ``fileList`` exists. """ allExists = True for fileName in fileList: allExists = allExists and os.path.isfile(fileName) return allExists
[docs]def flipSNPs(inPrefix, outPrefix, flipFileName): """Flip SNPs using Plink. :param inPrefix: the prefix of the input file. :param outPrefix: the prefix of the output file. :param flipFileName: the name of the file containing the markers to flip. :type inPrefix: str :type outPrefix: str :type flipFileName: str Using Plink, flip a set of markers in ``inPrefix``, and saves the results in ``outPrefix``. The list of markers to be flipped is in ``flipFileName``. """ plinkCommand = ["plink", "--noweb", "--bfile", inPrefix, "--flip", flipFileName, "--make-bed", "--out", outPrefix] runCommand(plinkCommand)
[docs]def excludeSNPs(inPrefix, outPrefix, exclusionFileName): """Exclude some SNPs using Plink. :param inPrefix: the prefix of the input file. :param outPrefix: the prefix of the output file. :param exclusionFileName: the name of the file containing the markers to be excluded. :type inPrefix: str :type outPrefix: str :type exclusionFileName: str Using Plink, exclude a list of markers from ``inPrefix``, and saves the results in ``outPrefix``. The list of markers are in ``exclusionFileName``. """ plinkCommand = ["plink", "--noweb", "--bfile", inPrefix, "--exclude", exclusionFileName, "--make-bed", "--out", outPrefix] runCommand(plinkCommand)
[docs]def renameSNPs(inPrefix, updateFileName, outPrefix): """Updates the name of the SNPs using Plink. :param inPrefix: the prefix of the input file. :param updateFileName: the name of the file containing the updated marker names. :param outPrefix: the prefix of the output file. :type inPrefix: str :type updateFileName: str :type outPrefix: str Using Plink, changes the name of the markers in ``inPrefix`` using ``updateFileName``. It saves the results in ``outPrefix``. """ plinkCommand = ["plink", "--noweb", "--bfile", inPrefix, "--update-map", updateFileName, "--update-name", "--make-bed", "--out", outPrefix] runCommand(plinkCommand)
[docs]def findFlippedSNPs(frqFile1, frqFile2, outPrefix): """Find flipped SNPs and flip them in the data. :param frqFile1: the name of the first frequency file. :param frqFile2: the name of the second frequency file. :param outPrefix: the prefix of the output files. :type frqFile1: str :type frqFile2: str :type outPrefix: str By reading two frequency files (``frqFile1`` and ``frqFile2``), it finds a list of markers that need to be flipped so that the first file becomes comparable with the second one. Also finds marker that need to be removed. A marker needs to be flipped in one of the two data set if the two markers are not comparable (same minor allele), but become comparable if we flip one of them. A marker will be removed if it is all homozygous in at least one data set. It will also be removed if it's impossible to determine the phase of the marker (*e.g.* if the two alleles are ``A`` and ``T`` or ``C`` and ``G``). """ allelesFiles = [{}, {}] files = [frqFile1, frqFile2] for k, fileName in enumerate(files): try: with open(fileName, "r") as inputFile: headerIndex = None for i, line in enumerate(inputFile): row = createRowFromPlinkSpacedOutput(line) if i == 0: # This is the header headerIndex = dict([ (row[j], j) for j in xrange(len(row)) ]) # Checking the columns for columnName in ["SNP", "A1", "A2"]: if columnName not in headerIndex: msg = "%(fileName)s: no column named " \ "%(columnName)s" % locals() raise ProgramError(msg) else: snpName = row[headerIndex["SNP"]] allele1 = row[headerIndex["A1"]] allele2 = row[headerIndex["A2"]] alleles = set([allele1, allele2]) if "0" in alleles: alleles.remove("0") allelesFiles[k][snpName] = alleles except IOError: msg = "%(fileName)s: no such file" % locals() raise ProgramError(msg) allelesFile1, allelesFile2 = allelesFiles # Finding the SNPs to flip toFlipOutputFile = None try: toFlipOutputFile = open(outPrefix + ".snp_to_flip_in_reference", "w") except IOError: msg = "%(outPrefix)s.snp_to_flip_in_reference: can't write " \ "file" % locals() raise ProgramError(msg) toRemoveOutputFile = None try: toRemoveOutputFile = open(outPrefix + ".snp_to_remove", "w") except IOError: msg = "%(outPrefix)s.snp_to_remove: can't write file" % locals() raise ProgramError(msg) for snpName in allelesFile1.iterkeys(): alleles1 = allelesFile1[snpName] alleles2 = allelesFile2[snpName] if (len(alleles1) == 2) and (len(alleles2) == 2): # Both are heterozygous if ({"A", "T"} == alleles1) or ({"C", "G"} == alleles1) or \ ({"A", "T"} == alleles2) or ({"C", "G"} == alleles2): # We can't flip those..., so we remove them print >>toRemoveOutputFile, snpName else: if alleles1 != alleles2: # Let's try the flip one if flipGenotype(alleles1) == alleles2: # We need to flip it print >>toFlipOutputFile, snpName else: # Those SNP are discordant... print >>toRemoveOutputFile, snpName else: # We want to remove this SNP, because there is at least one # homozygous individual print >>toRemoveOutputFile, snpName # Closing output files toFlipOutputFile.close() toRemoveOutputFile.close()
[docs]def computeFrequency(prefix, outPrefix): """Compute the frequency using Plink. :param prefix: the prefix of the file binary file for which we need to compute frequencies. :param outPrefix: the prefix of the output files. :type prefix: str :type outPrefix: str Uses Plink to compute the frequency of all the markers in the ``prefix`` binary file. """ plinkCommand = ["plink", "--noweb", "--bfile", prefix, "--freq", "--out", outPrefix] runCommand(plinkCommand)
[docs]def combinePlinkBinaryFiles(prefixes, outPrefix): """Combine Plink binary files. :param prefixes: a list of the prefix of the files that need to be combined. :param outPrefix: the prefix of the output file (the combined file). :type prefixes: list :type outPrefix: str It uses Plink to merge a list of binary files (which is a list of prefixes (strings)), and create the final data set which as ``outPrefix`` as the prefix. """ # The first file is the bfile, the others are the ones to merge outputFile = None try: outputFile = open(outPrefix + ".files_to_merge", "w") except IOError: msg = "%(outPrefix)s.filesToMerge: can't write file" % locals() raise ProgramError(msg) for prefix in prefixes[1:]: print >>outputFile, " ".join([ prefix + i for i in [".bed", ".bim", ".fam"] ]) # Closing the output files outputFile.close() # Runing plink plinkCommand = ["plink", "--noweb", "--bfile", prefixes[0], "--merge-list", outPrefix + ".files_to_merge", "--make-bed", "--out", outPrefix] runCommand(plinkCommand)
[docs]def findOverlappingSNPsWithReference(prefix, referencePrefixes, referencePopulations, outPrefix): """Find the overlapping SNPs in 4 different data sets. :param prefix: the prefix of all the files. :param referencePrefixes: the prefix of the reference population files. :param referencePopulations: the name of the reference population (same order as ``referencePrefixes``) :param outPrefix: the prefix of the output files. :type prefix: str :type referencePrefixes: list :type referencePopulations: list :type outPrefix: str It starts by reading the ``bim`` file of the source data set (``prefix.bim``). It finds all the markers (excluding the duplicated ones). Then it reads all of the reference population ``bim`` files (``referencePrefixes.bim``) and find all the markers that were found in the source data set. It creates three output files: * ``outPrefix.ref_snp_to_extract``: the name of the markers that needs to be extracted from the three reference panels. * ``outPrefix.source_snp_to_extract``: the name of the markers that needs to be extracted from the source panel. * ``outPrefix.update_names``: a file (readable by Plink) that will help in changing the names of the selected markers in the reference panels, so that they become comparable with the source panel. """ # Reading the main file sourceSnpToExtract = {} duplicates = set() try: with open(prefix + ".bim", "r") as inputFile: for line in inputFile: row = line.rstrip("\r\n").split("\t") chromosome = row[0] position = row[3] snpName = row[1] if (chromosome, position) not in sourceSnpToExtract: sourceSnpToExtract[(chromosome, position)] = snpName else: # It's a duplicate duplicates.add((chromosome, position)) except IOError: msg = "%s.bim: no such file" % prefix raise ProgramError(msg) # Removing duplicates from the list for snpID in duplicates: del sourceSnpToExtract[snpID] # Reading each of the reference markers refSnpToExtract = defaultdict(dict) refSnp = defaultdict(set) for refPop, refPrefix in izip(referencePopulations, referencePrefixes): try: with open(refPrefix + ".bim", "r") as inputFile: for line in inputFile: row = line.rstrip("\r\n").split("\t") chromosome = row[0] position = row[3] snpName = row[1] key = (chromosome, position) if key in sourceSnpToExtract: # We want this SNP refSnpToExtract[refPop][key] = snpName refSnp[refPop].add(key) logger.info(" - {:,d} overlaps with {}".format( len(refSnp[refPop]), refPrefix, )) except IOError: msg = "%(refPrefix)s.bim: no such file" % locals() raise ProgramError(msg) # Creating the intersect of the reference SNP refSnpIntersect = refSnp[referencePopulations[0]] for refPop in referencePopulations[1:]: refSnpIntersect &= refSnp[refPop] logger.info(" - {:,d} in common between reference " "panels".format(len(refSnpIntersect))) # Printing the names of the SNPs to extract in the reference refOutputFiles = {} try: for refPop in referencePopulations: refOutputFiles[refPop] = open( outPrefix + ".{}_snp_to_extract".format(refPop), mode="w", ) except IOError: msg = "{}.POP_snp_to_extract: can't write file".format(outPrefix) raise ProgramError(msg) sourceOutputFile = None try: sourceOutputFile = open(outPrefix + ".source_snp_to_extract", "w") except IOError: msg = "{}.source_snp_to_extract: can't write file".format(outPrefix) raise ProgramError(msg) changeNameOutputFiles = {} try: for refPop in referencePopulations: changeNameOutputFiles[refPop] = open( outPrefix + ".{}_update_names".format(refPop), mode="w", ) except IOError: msg = "%(outPrefix)s.updateNames: can't write file" % locals() raise ProgramError(msg) # Writing the file containing the SNPs to extract in the source for snpID in refSnpIntersect: print >>sourceOutputFile, sourceSnpToExtract[snpID] for refPop in referencePopulations: print >>refOutputFiles[refPop], refSnpToExtract[refPop][snpID] print >>changeNameOutputFiles[refPop], "\t".join([ refSnpToExtract[refPop][snpID], sourceSnpToExtract[snpID], ]) # Closing the output file sourceOutputFile.close() for refOutputFile in refOutputFiles.itervalues(): refOutputFile.close() for changeNameOutputFile in changeNameOutputFiles.itervalues(): changeNameOutputFile.close()
[docs]def extractSNPs(snpToExtractFileNames, referencePrefixes, popNames, outPrefix, runSGE, options): """Extract a list of SNPs using Plink. :param snpToExtractFileNames: the name of the files which contains the markers to extract from the original data set. :param referencePrefixes: a list containing the three reference population prefixes (the original data sets). :param popNames: a list containing the three reference population names. :param outPrefix: the prefix of the output file. :param runSGE: Whether using SGE or not. :param options: the options. :type snpToExtractFileNames: list :type referencePrefixes: list :type popNames: list :type outPrefix: str :type runSGE: boolean :type options: argparse.Namespace Using Plink, extract a set of markers from a list of prefixes. """ s = None jobIDs = [] jobTemplates = [] if runSGE: # Add the environment variable for DRMAA package if "DRMAA_LIBRARY_PATH" not in os.environ: msg = "could not load drmaa: set DRMAA_LIBRARY_PATH" raise ProgramError(msg) # Import the python drmaa library try: import drmaa except ImportError: raise ProgramError("drmaa is not install, install drmaa") # Initializing a session s = drmaa.Session() s.initialize() zipped = izip(popNames, referencePrefixes, snpToExtractFileNames) for popName, refPrefix, snpToExtractFileName in zipped: plinkCommand = ["plink", "--noweb", "--bfile", refPrefix, "--extract", snpToExtractFileName, "--make-bed", "--out", outPrefix + "." + popName] if runSGE: # We run using SGE # Creating the job template jt = s.createJobTemplate() jt.remoteCommand = plinkCommand[0] jt.workingDirectory = os.getcwd() jt.jobEnvironment = os.environ jt.args = plinkCommand[1:] jt.jobName = "_plink_extract_snps" # Cluster specifics if options.sge_walltime is not None: jt.hardWallclockTimeLimit = options.sge_walltime if options.sge_nodes is not None: native_spec = "-l nodes={}:ppn={}".format(options.sge_nodes[0], options.sge_nodes[1]) jt.nativeSpecification = native_spec # Running the job jobID = s.runJob(jt) # Storing the job template and the job ID jobTemplates.append(jt) jobIDs.append(jobID) else: # We run normal runCommand(plinkCommand) if runSGE: # We wait for all the jobs to be over hadProblems = [] for jobID in jobIDs: retVal = s.wait(jobID, drmaa.Session.TIMEOUT_WAIT_FOREVER) hadProblems.append(retVal.exitStatus == 0) # The jobs should be finished, so we clean everything # Deleting the job template, and exiting the session for jt in jobTemplates: s.deleteJobTemplate(jt) # Closing the connection s.exit() for hadProblem in hadProblems: if not hadProblem: msg = "Some SGE jobs had errors..." raise ProgramError(msg)
[docs]def runCommand(command): """Run a command. :param command: the command to run. :type command: list Tries to run a command. If it fails, raise a :py:class:`ProgramError`. This function uses the :py:mod:`subprocess` module. .. warning:: The variable ``command`` should be a list of strings (no other type). """ output = None try: output = subprocess.check_output( command, stderr=subprocess.STDOUT, shell=False, ) except subprocess.CalledProcessError: msg = "couldn't run command\n" + " ".join(command) raise ProgramError(msg)
[docs]def checkArgs(args): """Checks the arguments and options. :param args: an object containing the options of the program. :type args: argparse.Namespace :returns: ``True`` if everything was OK. If there is a problem with an option, an exception is raised using the :py:class:`ProgramError` class, a message is printed to the :py:class:`sys.stderr` and the program exists with code 1. """ # Check if we have the tped and the tfam files prefixes_to_check = [args.bfile] if not args.skip_ref_pops: for pop in ("ceu", "yri", "jpt_chb"): if vars(args)["{}_bfile".format(pop)] is None: raise ProgramError("argument --{}-bfile is " "required".format(pop.replace("_", "-"))) prefixes_to_check += [ args.ceu_bfile, args.yri_bfile, args.jpt_chb_bfile, ] for prefix in prefixes_to_check: if prefix is None: msg = "no input file" raise ProgramError(msg) for fileName in [prefix + i for i in [".bed", ".bim", ".fam"]]: if not os.path.isfile(fileName): msg = "%(fileName)s: no such file" % locals() raise ProgramError(msg) # Check the indep-pairwise option # The two first must be int, the last one float try: for i in xrange(2): tmp = int(args.indep_pairwise[i]) tmp = float(args.indep_pairwise[2]) except ValueError: msg = "indep-pairwise: need INT INT FLOAT" raise ProgramError(msg) # Check the maf value tmpMAF = None try: tmpMAF = float(args.maf) except ValueError: msg = "maf: must be a float, not %s" % args.maf raise ProgramError(msg) if (tmpMAF > 0.5) or (tmpMAF < 0.0): msg = "maf: must be between 0.0 and 0.5, not %s" % args.maf raise ProgramError(msg) # Check the number of line per file if args.line_per_file_for_sge < 1: msg = "line-per-file-for-sge: must be above 0, not " \ "%d" % args.line_per_file_for_sge raise ProgramError(msg) # Check the number of component to compute if args.nb_components < 2 or args.nb_components > 10: msg = ("nb-components: must be between 2 and 10 (inclusive), " "not {}".format(args.nb_components)) raise ProgramError(msg) # Check the minimum number of SNPs if args.min_nb_snp < 1: msg = "min-nb-snp: must be above 1" raise ProgramError(msg) return True
[docs]def parseArgs(argString=None): # pragma: no cover """Parses the command line options and arguments. :param argString: the options. :type argString: list :returns: A :py:class:`argparse.Namespace` object created by the :py:mod:`argparse` module. It contains the values of the different options. =========================== ====== ======================================== Options Type Description =========================== ====== ======================================== ``--bfile`` string The input file prefix (Plink binary file). ``--skip-ref-pops`` bool Perform the MDS computation, but skip the three reference panels. ``--ceu-bfile`` string The input file prefix for the CEU population (Plink binary file). ``--yri-bfile`` string The input file prefix for the YRI population (Plink binary file). ``--jpt-chb-bfile`` string The input file prefix for the JPT-CHB population (Plink binary file). ``--min-nb-snp`` int The minimum number of markers needed to compute IBS. ``--indep-pairwise`` string Three numbers: window size, window shift and the r2 threshold. ``--maf`` string Restrict to SNPs with MAF >= threshold. ``--sge`` bool Use SGE for parallelization. ``--sge-walltime`` int The time limit (for clusters). ``--sge-nodes`` int Two INTs (number of nodes and number of int processor per nodes). ``--ibs-sge-walltime`` int The time limit (for clusters) (for IBS) ``--ibs-sge-nodes`` int Two INTs (number of nodes and number of int processor per nodes) (for IBS). ``--line-per-file-for-sge`` int The number of line per file for SGE task array. ``--nb-components`` int The number of component to compute. ``--outliers-of`` string Finds the ouliers of this population. ``--multiplier`` float To find the outliers, we look for more than x times the cluster standard deviation. ``--xaxis`` string The component to use for the X axis. ``--yaxis`` string The component to use for the Y axis. ``--format`` string The output file format. ``--title`` string The title of the MDS plot. ``--xlabel`` string The label of the X axis. ``--ylabel`` string The label of the Y axis. ``--create-scree-plot`` bool Computes Eigenvalues and creates a scree plot. ``--scree-plot-title`` string The main title of the scree plot ``--out`` string The prefix of the output files. =========================== ====== ======================================== .. note:: No option check is done here (except for the one automatically done by argparse). Those need to be done elsewhere (see :py:func:`checkArgs`). """ args = None if argString is None: args = parser.parse_args() else: args = parser.parse_args(argString) return args
[docs]class ProgramError(Exception): """An :py:class:`Exception` raised in case of a problem. :param msg: the message to print to the user before exiting. :type msg: str """ def __init__(self, msg): """Construction of the :py:class:`ProgramError` class. :param msg: the message to print to the user :type msg: str """ self.message = str(msg) def __str__(self): return self.message
# The parser object pretty_name = "Ethnicity" desc = "Checks sample's ethnicity using reference populations and IBS." long_desc = ("The script uses pairwise IBS matrix as a distance metric to " "identify cryptic relatedness among samples and sample outliers " "by multi-dimensional scaling (MDS).") parser = argparse.ArgumentParser(description=desc) parser.add_argument("-v", "--version", action="version", version="pyGenClean version {}".format(__version__)) # The INPUT files group = parser.add_argument_group("Input File") group.add_argument("--bfile", type=str, metavar="FILE", required=True, help=("The input file prefix (will find the plink binary " "files by appending the prefix to the .bim, .bed and " ".fam files, respectively.")) group.add_argument("--skip-ref-pops", action="store_true", help=("Perform the MDS computation, but skip the three " "reference panels.")) group.add_argument("--ceu-bfile", type=str, metavar="FILE", help=("The input file prefix (will find the plink binary " "files by appending the prefix to the .bim, .bed and " ".fam files, respectively.) for the CEU population")) group.add_argument("--yri-bfile", type=str, metavar="FILE", help=("The input file prefix (will find the plink binary " "files by appending the prefix to the .bim, .bed and " ".fam files, respectively.) for the YRI population")) group.add_argument("--jpt-chb-bfile", type=str, metavar="FILE", help=("The input file prefix (will find the plink binary " "files by appending the prefix to the .bim, .bed and " ".fam files, respectively.) for the JPT-CHB " "population")) # The options group = parser.add_argument_group("Options") group.add_argument("--min-nb-snp", type=int, metavar="INT", default=8000, help=("The minimum number of markers needed to compute IBS " "values. [Default: %(default)d]")) group.add_argument("--indep-pairwise", type=str, metavar="STR", nargs=3, default=["50", "5", "0.1"], help=("Three numbers: window size, window shift and " "the r2 threshold. [default: %(default)s]")) group.add_argument("--maf", type=str, metavar="FLOAT", default="0.05", help=("Restrict to SNPs with MAF >= threshold. " "[default: %(default)s]")) group.add_argument("--sge", action="store_true", help="Use SGE for parallelization.") group.add_argument("--sge-walltime", type=str, metavar="TIME", help=("The walltime for the job to run on the cluster. Do " "not use if you are not required to specify a " "walltime for your jobs on your cluster (e.g. 'qsub " "-lwalltime=1:0:0' on the cluster).")) group.add_argument("--sge-nodes", type=int, metavar="INT", nargs=2, help=("The number of nodes and the number of processor per " "nodes to use (e.g. 'qsub -lnodes=X:ppn=Y' on the " "cluster, where X is the number of nodes and Y is " " the number of processor to use. Do not use if you " "are not required to specify the number of nodes for " "your jobs on the cluster.")) group.add_argument("--ibs-sge-walltime", type=str, metavar="TIME", help=("The walltime for the IBS jobs to run on the " "cluster. Do not use if you are not required to " "specify a walltime for your jobs on your cluster " "(e.g. 'qsub -lwalltime=1:0:0' on the cluster).")) group.add_argument("--ibs-sge-nodes", type=int, metavar="INT", nargs=2, help=("The number of nodes and the number of processor per " "nodes to use for the IBS jobs (e.g. 'qsub " "-lnodes=X:ppn=Y' on the cluster, where X is the " "number of nodes and Y is the number of processor to " "use. Do not use if you are not required to specify " "the number of nodes for your jobs on the cluster.")) group.add_argument("--line-per-file-for-sge", type=int, metavar="INT", default=100, help=("The number of line per file for SGE " "task array for the IBS jobs. [default: " "%(default)d]")) group.add_argument("--nb-components", type=int, metavar="INT", default=10, help=("The number of component to compute. [default: " "%(default)d]")) # The outlier options group = parser.add_argument_group("Outlier Options") find_outliers.add_custom_options(group) # The MDS plotting options group = parser.add_argument_group("MDS Plot Options") PlotMDS.addCustomOptions(group) # The Scree Plot options group = parser.add_argument_group("Scree Plot Options") group.add_argument("--create-scree-plot", action="store_true", help="Computes Eigenvalues and creates a scree plot.") PlotEigenvalues.add_custom_options(group) # The OUTPUT files group = parser.add_argument_group("Output File") group.add_argument("--out", type=str, metavar="FILE", default="ethnicity", help=("The prefix of the output files. [default: " "%(default)s]"))
[docs]def safe_main(): """A safe version of the main function (that catches ProgramError).""" try: main() except KeyboardInterrupt: logger.info("Cancelled by user") sys.exit(0) except ProgramError as e: logger.error(e.message) parser.error(e.message)
if __name__ == "__main__": safe_main()