Source code for pyGenClean.Ethnicity.find_outliers

#!/usr/bin/env python2.7

# This file is part of pyGenClean.
#
# pyGenClean is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation, either version 3 of the License, or (at your option) any later
# version.
#
# pyGenClean is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
# A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along with
# pyGenClean.  If not, see <http://www.gnu.org/licenses/>.


import os
import re
import sys
import logging
import argparse
from glob import glob

import numpy as np

from .. import __version__
from ..PlinkUtils import createRowFromPlinkSpacedOutput as create_row


logger = logging.getLogger("find_outliers")


[docs]def main(argString=None): """The main function. :param argString: the options. :type argString: list of strings These are the steps of the modules: 1. Prints the options. 2. Reads the population file (:py:func:`read_population_file`). 3. Reads the ``mds`` file (:py:func:`read_mds_file`). 4. Computes the three reference population clusters' centers (:py:func:`find_ref_centers`). 5. Computes three clusters according to the reference population clusters' centers, and finds the outliers of a given reference population (:py:func:`find_outliers`). This steps also produce three different plots. 6. Writes outliers in a file (``prefix.outliers``). """ # Getting and checking the options args = parseArgs(argString) checkArgs(args) logger.info("Options used:") for key, value in vars(args).iteritems(): logger.info(" --{} {}".format(key.replace("_", "-"), value)) # Reads the population file logger.info("Reading population file") populations = read_population_file(args.population_file) # Reads the MDS file logger.info("Reading MDS file") mds = read_mds_file(args.mds, args.xaxis, args.yaxis, populations) # Finds the population centers logger.info("Finding reference population centers") centers, center_info = find_ref_centers(mds) # Computes three clusters using KMeans and the reference cluster centers logger.info("Finding outliers") outliers = find_outliers(mds, centers, center_info, args.outliers_of, args) logger.info(" - There are {} outliers for the {} population".format( len(outliers), args.outliers_of, )) # Printing the outlier file try: with open(args.out + ".outliers", 'w') as output_file: for sample_id in outliers: print >>output_file, "\t".join(sample_id) except IOError: msg = "{}: can't write file".format(args.out + ".outliers") raise ProgramError(msg) # Printing the outlier population file try: with open(args.out + ".population_file_outliers", "w") as output_file: for sample_id, population in populations.iteritems(): if sample_id in outliers: population = "OUTLIER" print >>output_file, "\t".join(list(sample_id) + [population]) except IOError: msg = "{}: can't write file".format( args.out + ".population_file_outliers", ) raise ProgramError(msg) # If there is a summary file in the working directory (for LaTeX), we want # to modify it, because it means that this script is run after the pipeline # (to modify the multiplier, for example). if args.overwrite_tex: summary_file = glob(os.path.join(os.getcwd(), "*.summary.tex")) if len(summary_file) == 0: logger.warning("No TeX summary file found") if len(summary_file) > 1: raise ProgramError("More than one TeX summary file found") summary_file = summary_file[0] # Overwriting overwrite_tex(summary_file, len(outliers), args)
[docs]def overwrite_tex(tex_fn, nb_outliers, script_options): """Overwrites the TeX summary file with new values. :param tex_fn: the name of the TeX summary file to overwrite. :param nb_outliers: the number of outliers. :param script_options: the script options. :type tex_fn: str :type nb_outliers: int :type options: argparse.Namespace """ # Getting the content of the TeX file content = None with open(tex_fn, "r") as i_file: content = i_file.read() # Is there a figure has_figure = re.search("includegraphics", content) is not None # Changing the first sentence content, n = re.subn( r"(Using\s[0-9,]+\smarkers?\sand\sa\smultiplier\sof\s)[0-9.]+" r"(,\sthere\swas\sa\stotal\sof\s)[0-9,]+(\soutliers?\sof\sthe\s)" r"\w+(\spopulation.)", r"\g<1>{}\g<2>{:,d}\g<3>{}\g<4>".format( script_options.multiplier, nb_outliers, script_options.outliers_of, ), content ) if n != 1: raise ProgramError("{}: invalid TeX summary file".format(tex_fn)) # Do we need to change the principal components in the text? c1_c2 = {"C1", "C2"} if has_figure and ({script_options.xaxis, script_options.yaxis} != c1_c2): content, n = re.subn( r"(shows\s)the\sfirst\stwo\sprincipal\scomponents(\sof\sthe\sMDS" r"\sanalysis)", r"\g<1>components {} versus {}\g<2>".format( script_options.yaxis.replace("C", ""), script_options.xaxis.replace("C", ""), ), content, ) if n != 1: raise ProgramError("{}: invalid TeX summary file".format(tex_fn)) content, n = re.subn( r"(MDS\splots\sshowing\s)the\sfirst\stwo\sprincipal\scomponents" r"(\sof\sthe\ssource\sdataset\swith\sthe\sreference\spanels.)", r"\g<1>components {} versus {}\g<2>".format( script_options.yaxis.replace("C", ""), script_options.xaxis.replace("C", ""), ), content, ) if n != 1: raise ProgramError("{}: invalid TeX summary file".format(tex_fn)) if has_figure: # Changing the population in the figure description content, n = re.subn( r"(where\soutliers\sof\sthe\s)\w+(\spopulation\sare\sshown\s" r"in\sgrey.)", r"\g<1>{}\g<2>".format(script_options.outliers_of), content, ) if n != 1: raise ProgramError("{}: invalid TeX summary file".format(tex_fn)) content, n = re.subn( r"(The\soutliers\sof\sthe\s)\w+(\spopulation\sare\sshown\sin\s" r"grey,\swhile\ssamples\sof\sthe\ssource\sdataset\sthat\s" r"resemble\sthe\s)\w+(\spopulation\sare\sshown\sin\sorange.\sA\s" r"multiplier\sof\s)[0-9.]+(\swas\sused\sto\sfind\sthe\s)[0-9,]+(\s" r"outliers?.)", r"\g<1>{pop}\g<2>{pop}\g<3>{mult}\g<4>{nb:,d}\g<5>".format( pop=script_options.outliers_of, mult=script_options.multiplier, nb=nb_outliers, ), content, ) if n != 1: raise ProgramError("{}: invalid TeX summary file".format(tex_fn)) # Changing the figure (path) content, n = re.subn( r"(\s\\includegraphics\[.+\]\{)\{ethnicity.outliers\}.png(\}\s)", r"\g<1>{}\g<2>".format( "{" + script_options.out + ".outliers}." + script_options.format ), content, ) if n != 1: raise ProgramError("{}: invalid TeX summary file".format(tex_fn)) # Saving the new content with open(tex_fn, "w") as o_file: o_file.write(content)
[docs]def find_outliers(mds, centers, center_info, ref_pop, options): """Finds the outliers for a given population. :param mds: the ``mds`` information about each samples. :param centers: the centers of the three reference population clusters. :param center_info: the label of the three reference population clusters. :param ref_pop: the reference population for which we need the outliers from. :param options: the options :type mds: numpy.recarray :type centers: numpy.array :type center_info: dict :type ref_pop: str :type options: argparse.Namespace :returns: a :py:class:`set` of outliers from the ``ref_pop`` population. Perform a ``KMeans`` classification using the three centers from the three reference population cluster. Samples are outliers of the required reference population (``ref_pop``) if: * the sample is part of another reference population cluster; * the sample is an outlier of the desired reference population (``ref_pop``). A sample is an outlier of a given cluster :math:`C_j` if the distance between this sample and the center of the cluster :math:`C_j` (:math:`O_j`) is bigger than a constant times the cluster's standard deviation :math:`\\sigma_j`. .. math:: \\sigma_j = \\sqrt{\\frac{\\sum{d(s_i,O_j)^2}}{||C_j|| - 1}} where :math:`||C_j||` is the number of samples in the cluster :math:`C_j`, and :math:`d(s_i,O_j)` is the distance between the sample :math:`s_i` and the center :math:`O_j` of the cluster :math:`C_j`. .. math:: d(s_i, O_j) = \\sqrt{(x_{O_j} - x_{s_i})^2 + (y_{O_j} - y_{s_i})^2} Using a constant equals to one ensure we remove 100% of the outliers from the cluster. Using a constant of 1.6 or 1.9 ensures we remove 99% and 95% of outliers, respectively (an error rate of 1% and 5%, respectively). """ # Importing matplotlib for plotting purposes import matplotlib as mpl if options.format != "X11" and mpl.get_backend() != "agg": mpl.use("Agg") import matplotlib.pyplot as plt if options.format != "X11": plt.ioff() import matplotlib.mlab as mlab from sklearn.cluster import KMeans from sklearn.metrics.pairwise import euclidean_distances # Formatting the data data = np.array(zip(mds["c1"], mds["c2"])) # Configuring and running the KMeans k_means = KMeans(init=centers, n_clusters=3, n_init=1) k_means.fit_predict(data) # Creating the figure and axes fig_before, axe_before = plt.subplots(1, 1) fig_after, axe_after = plt.subplots(1, 1) fig_outliers, axe_outliers = plt.subplots(1, 1) # Setting the axe axe_before.xaxis.set_ticks_position("bottom") axe_before.yaxis.set_ticks_position("left") axe_before.spines["top"].set_visible(False) axe_before.spines["right"].set_visible(False) axe_before.spines["bottom"].set_position(("outward", 9)) axe_before.spines["left"].set_position(("outward", 9)) axe_after.xaxis.set_ticks_position("bottom") axe_after.yaxis.set_ticks_position("left") axe_after.spines["top"].set_visible(False) axe_after.spines["right"].set_visible(False) axe_after.spines["bottom"].set_position(("outward", 9)) axe_after.spines["left"].set_position(("outward", 9)) axe_outliers.xaxis.set_ticks_position("bottom") axe_outliers.yaxis.set_ticks_position("left") axe_outliers.spines["top"].set_visible(False) axe_outliers.spines["right"].set_visible(False) axe_outliers.spines["bottom"].set_position(("outward", 9)) axe_outliers.spines["left"].set_position(("outward", 9)) # Setting the title and labels axe_before.set_title("Before finding outliers", weight="bold") axe_before.set_xlabel(options.xaxis) axe_before.set_ylabel(options.yaxis) axe_after.set_title("After finding outliers\n($> {} " "\\sigma$)".format(options.multiplier), weight="bold") axe_after.set_xlabel(options.xaxis) axe_after.set_ylabel(options.yaxis) axe_outliers.set_title("Outliers", weight="bold") axe_outliers.set_xlabel(options.xaxis) axe_outliers.set_ylabel(options.yaxis) # The population name ref_pop_name = ["CEU", "YRI", "JPT-CHB"] # The colors colors = ["#CC0000", "#669900", "#0099CC"] outlier_colors = ["#FFCACA", "#E2F0B6", "#C5EAF8"] # Plotting each of the clusters with the initial center plots_before = [] plots_after = [] plots_outliers = [] plot_outliers = None plot_not_outliers = None outliers_set = set() for label in xrange(3): # Subsetting the data subset_mds = mds[k_means.labels_ == label] subset_data = data[k_means.labels_ == label] # Plotting the cluster p, = axe_before.plot(subset_mds["c1"], subset_mds["c2"], "o", mec=colors[label], mfc=colors[label], ms=2, clip_on=False) plots_before.append(p) # Plotting the cluster center (the real one) axe_before.plot(centers[label][0], centers[label][1], "o", mec="#000000", mfc="#FFBB33", ms=6, clip_on=False) # Computing the distances distances = euclidean_distances(subset_data, centers[label]) # Finding the outliers (that are not in the reference populations sigma = np.sqrt(np.true_divide(np.sum(distances ** 2), len(distances) - 1)) outliers = np.logical_and( (distances > options.multiplier * sigma).flatten(), subset_mds["pop"] != ref_pop_name[label], ) logger.info(" - {} outliers for the {} cluster".format( np.sum(outliers), ref_pop_name[label], )) # Saving the outliers if ref_pop_name[label] != options.outliers_of: # This is not the population we want, hence everybody is an outlier # (we don't include the reference population). outlier_mds = subset_mds[subset_mds["pop"] != ref_pop_name[label]] outliers_set |= set([(i["fid"], i["iid"]) for i in outlier_mds]) # Plotting all samples that are not part of the reference # populations axe_outliers.plot( subset_mds["c1"][subset_mds["pop"] != ref_pop_name[label]], subset_mds["c2"][subset_mds["pop"] != ref_pop_name[label]], "o", mec="#555555", mfc="#555555", ms=2, clip_on=False, ) else: # This is the population we want, hence only the real outliers are # outliers (we don't include the reference population) outlier_mds = subset_mds[ np.logical_and(subset_mds["pop"] != ref_pop_name[label], outliers) ] # Plotting the outliers plot_outliers, = axe_outliers.plot(outlier_mds["c1"], outlier_mds["c2"], "o", mec="#555555", mfc="#555555", ms=2, clip_on=False) # Plotting the not outliers plot_not_outliers, = axe_outliers.plot( subset_mds["c1"][np.logical_and( ~outliers, subset_mds["pop"] != ref_pop_name[label] )], subset_mds["c2"][np.logical_and( ~outliers, subset_mds["pop"] != ref_pop_name[label] )], "o", mec="#FFBB33", mfc="#FFBB33", ms=2, clip_on=False, ) outliers_set |= set([(i["fid"], i["iid"]) for i in outlier_mds]) # Plotting the cluster (without outliers) p, = axe_after.plot( subset_mds[~outliers]["c1"], subset_mds[~outliers]["c2"], "o", mec=colors[label], mfc=colors[label], ms=2, clip_on=False, ) plots_after.append(p) # Plotting the cluster (only outliers) axe_after.plot(subset_mds[outliers]["c1"], subset_mds[outliers]["c2"], "o", mec=outlier_colors[label], mfc=outlier_colors[label], ms=2, clip_on=False) # Plotting only the reference populations p, = axe_outliers.plot( subset_mds["c1"][subset_mds["pop"] == ref_pop_name[label]], subset_mds["c2"][subset_mds["pop"] == ref_pop_name[label]], "o", mec=colors[label], mfc=colors[label], ms=2, clip_on=False, ) plots_outliers.append(p) # Plotting the cluster center (the real one) axe_after.plot(centers[label][0], centers[label][1], "o", mec="#000000", mfc="#FFBB33", ms=6) # The legends axe_before.legend(plots_before, ref_pop_name, loc="best", numpoints=1, fancybox=True, fontsize=8).get_frame().set_alpha(0.5) axe_after.legend(plots_after, ref_pop_name, loc="best", numpoints=1, fancybox=True, fontsize=8).get_frame().set_alpha(0.5) axe_outliers.legend(plots_outliers + [plot_not_outliers, plot_outliers], ref_pop_name + ["SOURCE", "OUTLIERS"], loc="best", numpoints=1, fancybox=True, fontsize=8).get_frame().set_alpha(0.5) # Saving the figure fig_before.savefig("{}.before.{}".format(options.out, options.format), dpi=300) fig_after.savefig("{}.after.{}".format(options.out, options.format), dpi=300) fig_outliers.savefig("{}.outliers.{}".format(options.out, options.format), dpi=300) return outliers_set
[docs]def find_ref_centers(mds): """Finds the center of the three reference clusters. :param mds: the ``mds`` information about each samples. :type mds: numpy.recarray :returns: a tuple with a :py:class:`numpy.array` containing the centers of the three reference population cluster as first element, and a :py:class:`dict` containing the label of each of the three reference population clusters. First, we extract the ``mds`` values of each of the three reference populations. The, we compute the center of each of those clusters by computing the means. .. math:: \\textrm{Cluster}_\\textrm{pop} = \\left( \\frac{\\sum_{i=1}^n x_i}{n}, \\frac{\\sum_{i=1}^n y_i}{n} \\right) """ # Computing the centers of each of the reference clusters ceu_mds = mds[mds["pop"] == "CEU"] yri_mds = mds[mds["pop"] == "YRI"] asn_mds = mds[mds["pop"] == "JPT-CHB"] # Computing the centers centers = [[np.mean(ceu_mds["c1"]), np.mean(ceu_mds["c2"])], [np.mean(yri_mds["c1"]), np.mean(yri_mds["c2"])], [np.mean(asn_mds["c1"]), np.mean(asn_mds["c2"])]] return np.array(centers), {"CEU": 0, "YRI": 1, "JPT-CHB": 2}
[docs]def read_mds_file(file_name, c1, c2, pops): """Reads a MDS file. :param file_name: the name of the ``mds`` file. :param c1: the first component to read (x axis). :param c2: the second component to read (y axis). :param pops: the population of each sample. :type file_name: str :type c1: str :type c2: str :type pops: dict :returns: a :py:class:`numpy.recarray` (one sample per line) with the information about the family ID, the individual ID, the first component to extract, the second component to extract and the population. The ``mds`` file is the result of Plink (as produced by the :py:mod:`pyGenClean.Ethnicity.check_ethnicity` module). """ mds = [] max_fid = 0 max_iid = 0 with open(file_name, 'rb') as input_file: # Getting and checking the header header_index = dict([ (col_name, i) for i, col_name in enumerate(create_row(input_file.readline())) ]) for col_name in {"FID", "IID", c1, c2}: if col_name not in header_index: msg = "{}: no column named {}".format(file_name, col_name) raise ProgramError(msg) for row in map(create_row, input_file): # Getting the sample ID sample_id = (row[header_index["FID"]], row[header_index["IID"]]) # Checking we have a population for this sample if sample_id not in pops: msg = "{} {}: not in population file".format(sample_id[0], sample_id[1]) raise ProgramError(msg) # Saving the data mds.append((sample_id[0], sample_id[1], row[header_index[c1]], row[header_index[c2]], pops[sample_id])) # Cheking to find the max FID if len(sample_id[0]) > max_fid: max_fid = len(sample_id[0]) if len(sample_id[1]) > max_iid: max_iid = len(sample_id[1]) # Creating the numpy array mds = np.array(mds, dtype=[("fid", "a{}".format(max_fid)), ("iid", "a{}".format(max_iid)), ("c1", float), ("c2", float), ("pop", "a7")]) return mds
[docs]def read_population_file(file_name): """Reads the population file. :param file_name: the name of the population file. :type file_name: str :returns: a :py:class:`dict` containing the population for each of the samples. The population file should contain three columns: 1. The family ID. 2. The individual ID. 3. The population of the file (one of ``CEU``, ``YRI``, ``JPT-CHB`` or ``SOURCE``). The outliers are from the ``SOURCE`` population, when compared to one of the three reference population (``CEU``, ``YRI`` or ``JPT-CHB``). """ pops = {} required_pops = {"CEU", "YRI", "JPT-CHB", "SOURCE"} with open(file_name, 'rb') as input_file: for line in input_file: row = line.rstrip("\r\n").split("\t") # The data sample_id = tuple(row[:2]) pop = row[-1] # Checking the pop if pop not in required_pops: msg = ("{}: sample {}: unknown population " "{}".format(file_name, " ".join(sample_id), pop)) raise ProgramError(msg) # Saving the population file pops[tuple(row[:2])] = row[-1] return pops
[docs]def checkArgs(args): """Checks the arguments and options. :param args: a :py:class:`argparse.Namespace` object containing the options of the program. :type args: argparse.Namespace :returns: ``True`` if everything was OK. If there is a problem with an option, an exception is raised using the :py:class:`ProgramError` class, a message is printed to the :class:`sys.stderr` and the program exists with code 1. """ # Checking the input files if not os.path.isfile(args.mds): msg = "{}: no such file".format(args.mds) raise ProgramError(msg) if not os.path.isfile(args.population_file): msg = "{}: no such file".format(args.population_file) raise ProgramError(msg) # Checking the chosen components if args.xaxis == args.yaxis: msg = "xaxis must be different than yaxis" raise ProgramError(msg) return True
[docs]def parseArgs(argString=None): # pragma: no cover """Parses the command line options and arguments. :param argString: the options. :type argString: list :returns: A :py:class:`argparse.Namespace` object created by the :py:mod:`argparse` module. It contains the values of the different options. ===================== ====== ============================================== Options Type Description ===================== ====== ============================================== ``--mds`` string The MDS file from Plink. ``--population-file`` string A population file from :py:mod:`pyGenClean.Ethnicity.check_ethnicity` module. ``--format`` string The output file format (png, ps, or pdf. ``--outliers-of`` string Finds the outliers of this population. ``--multiplier`` float To find the outliers, we look for more than :math:`x` times the cluster standard deviation. ``--xaxis`` string The component to use for the X axis. ``--yaxis`` string The component to use for the Y axis. ``--format`` string The output file format (png, ps, or pdf formats are available). ``--out`` string The prefix of the output files. ===================== ====== ============================================== .. note:: No option check is done here (except for the one automatically done by argparse). Those need to be done elsewhere (see :py:func:`checkArgs`). """ args = None if argString is None: args = parser.parse_args() else: args = parser.parse_args(argString) return args
[docs]def add_custom_options(parser): """Adds custom options to a parser. :param parser: the parser to which to add options. :type parser: argparse.ArgumentParser """ parser.add_argument("--outliers-of", type=str, metavar="POP", default="CEU", choices=["CEU", "YRI", "JPT-CHB"], help=("Finds the outliers of this population. " "[default: %(default)s]")) parser.add_argument("--multiplier", type=float, metavar="FLOAT", default=1.9, help=("To find the outliers, we look for more than " "x times the cluster standard deviation. " "[default: %(default).1f]")) parser.add_argument("--xaxis", type=str, metavar="COMPONENT", default="C1", choices=["C{}".format(i) for i in xrange(1, 11)], help=("The component to use for the X axis. " "[default: %(default)s]")) parser.add_argument("--yaxis", type=str, metavar="COMPONENT", default="C2", choices=["C{}".format(i) for i in xrange(1, 11)], help=("The component to use for the Y axis. " "[default: %(default)s]"))
[docs]class ProgramError(Exception): """An :py:class:`Exception` raised in case of a problem. :param msg: the message to print to the user before exiting. :type msg: str """ def __init__(self, msg): """Construction of the :py:class:`ProgramError` class. :param msg: the message to print to the user :type msg: str """ self.message = str(msg) def __str__(self): return self.message
# The parser object desc = "Finds outliers in SOURCE from CEU samples." long_desc = None parser = argparse.ArgumentParser(description=desc) parser.add_argument("-v", "--version", action="version", version="pyGenClean version {}".format(__version__)) # The INPUT files group = parser.add_argument_group("Input File") group.add_argument("--mds", type=str, metavar="FILE", required=True, help=("The MDS file from Plink")) group.add_argument("--population-file", type=str, metavar="FILE", required=True, help=("A population file containing the following columns " "(without a header): FID, IID and POP. POP should be " "one of 'CEU', 'JPT-CHB', 'YRI' and SOURCE.")) # The options group = parser.add_argument_group("Options") add_custom_options(group) group.add_argument("--format", type=str, metavar="FORMAT", default="png", choices=["png", "ps", "pdf"], help=("The output file format (png, ps, or pdf " "formats are available). [default: %(default)s]")) group.add_argument("--overwrite-tex", action="store_true", help=("Using this option will overwrite any file that " "match '*.summary.tex' (if any). This file is " "automatically generated by the pyGenClean main " "pipeline.")) # The OUTPUT files group = parser.add_argument_group("Output File") group.add_argument("--out", type=str, metavar="FILE", default="ethnicity", help=("The prefix of the output files. [default: " "%(default)s]"))
[docs]def safe_main(): """A safe version of the main function (that catches ProgramError).""" try: main() except KeyboardInterrupt: logger.info("Cancelled by user") sys.exit(0) except ProgramError as e: logger.error(e.message) parser.error(e.message)
if __name__ == "__main__": safe_main()