Source code for pyGenClean.Ethnicity.find_outliers

#!/usr/bin/env python2.7

# This file is part of pyGenClean.
#
# pyGenClean is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation, either version 3 of the License, or (at your option) any later
# version.
#
# pyGenClean is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
# A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along with
# pyGenClean.  If not, see <http://www.gnu.org/licenses/>.


import os
import re
import sys
import logging
import argparse
from glob import glob

import numpy as np

from .. import __version__
from ..PlinkUtils import createRowFromPlinkSpacedOutput as create_row


logger = logging.getLogger("find_outliers")


[docs]def main(argString=None):
    """The main function.

    :param argString: the options.

    :type argString: list of strings

    These are the steps of the modules:

    1. Prints the options.
    2. Reads the population file (:py:func:`read_population_file`).
    3. Reads the ``mds`` file (:py:func:`read_mds_file`).
    4. Computes the three reference population clusters' centers
       (:py:func:`find_ref_centers`).
    5. Computes three clusters according to the reference population clusters'
       centers, and finds the outliers of a given reference population
       (:py:func:`find_outliers`). This steps also produce three different
       plots.
    6. Writes outliers in a file (``prefix.outliers``).

    """
    # Getting and checking the options
    args = parseArgs(argString)
    checkArgs(args)

    logger.info("Options used:")
    for key, value in vars(args).iteritems():
        logger.info("  --{} {}".format(key.replace("_", "-"), value))

    # Reads the population file
    logger.info("Reading population file")
    populations = read_population_file(args.population_file)

    # Reads the MDS file
    logger.info("Reading MDS file")
    mds = read_mds_file(args.mds, args.xaxis, args.yaxis, populations)

    # Finds the population centers
    logger.info("Finding reference population centers")
    centers, center_info = find_ref_centers(mds)

    # Computes three clusters using KMeans and the reference cluster centers
    logger.info("Finding outliers")
    outliers = find_outliers(mds, centers, center_info, args.outliers_of, args)
    logger.info("  - There are {} outliers for the {} population".format(
        len(outliers),
        args.outliers_of,
    ))

    # Printing the outlier file
    try:
        with open(args.out + ".outliers", 'w') as output_file:
            for sample_id in outliers:
                print >>output_file, "\t".join(sample_id)
    except IOError:
        msg = "{}: can't write file".format(args.out + ".outliers")
        raise ProgramError(msg)

    # Printing the outlier population file
    try:
        with open(args.out + ".population_file_outliers", "w") as output_file:
            for sample_id, population in populations.iteritems():
                if sample_id in outliers:
                    population = "OUTLIER"
                print >>output_file, "\t".join(list(sample_id) + [population])
    except IOError:
        msg = "{}: can't write file".format(
            args.out + ".population_file_outliers",
        )
        raise ProgramError(msg)

    # If there is a summary file in the working directory (for LaTeX), we want
    # to modify it, because it means that this script is run after the pipeline
    # (to modify the multiplier, for example).
    if args.overwrite_tex:
        summary_file = glob(os.path.join(os.getcwd(), "*.summary.tex"))
        if len(summary_file) == 0:
            logger.warning("No TeX summary file found")
        if len(summary_file) > 1:
            raise ProgramError("More than one TeX summary file found")
        summary_file = summary_file[0]

        # Overwriting
        overwrite_tex(summary_file, len(outliers), args)


[docs]def overwrite_tex(tex_fn, nb_outliers, script_options):
    """Overwrites the TeX summary file with new values.

    :param tex_fn: the name of the TeX summary file to overwrite.
    :param nb_outliers: the number of outliers.
    :param script_options: the script options.

    :type tex_fn: str
    :type nb_outliers: int
    :type options: argparse.Namespace

    """
    # Getting the content of the TeX file
    content = None
    with open(tex_fn, "r") as i_file:
        content = i_file.read()

    # Is there a figure
    has_figure = re.search("includegraphics", content) is not None

    # Changing the first sentence
    content, n = re.subn(
        r"(Using\s[0-9,]+\smarkers?\sand\sa\smultiplier\sof\s)[0-9.]+"
        r"(,\sthere\swas\sa\stotal\sof\s)[0-9,]+(\soutliers?\sof\sthe\s)"
        r"\w+(\spopulation.)",
        r"\g<1>{}\g<2>{:,d}\g<3>{}\g<4>".format(
            script_options.multiplier,
            nb_outliers,
            script_options.outliers_of,
        ),
        content
    )
    if n != 1:
        raise ProgramError("{}: invalid TeX summary file".format(tex_fn))

    # Do we need to change the principal components in the text?
    c1_c2 = {"C1", "C2"}
    if has_figure and ({script_options.xaxis, script_options.yaxis} != c1_c2):
        content, n = re.subn(
            r"(shows\s)the\sfirst\stwo\sprincipal\scomponents(\sof\sthe\sMDS"
            r"\sanalysis)",
            r"\g<1>components {} versus {}\g<2>".format(
                script_options.yaxis.replace("C", ""),
                script_options.xaxis.replace("C", ""),
            ),
            content,
        )
        if n != 1:
            raise ProgramError("{}: invalid TeX summary file".format(tex_fn))

        content, n = re.subn(
            r"(MDS\splots\sshowing\s)the\sfirst\stwo\sprincipal\scomponents"
            r"(\sof\sthe\ssource\sdataset\swith\sthe\sreference\spanels.)",
            r"\g<1>components {} versus {}\g<2>".format(
                script_options.yaxis.replace("C", ""),
                script_options.xaxis.replace("C", ""),
            ),
            content,
        )
        if n != 1:
            raise ProgramError("{}: invalid TeX summary file".format(tex_fn))

    if has_figure:
        # Changing the population in the figure description
        content, n = re.subn(
            r"(where\soutliers\sof\sthe\s)\w+(\spopulation\sare\sshown\s"
            r"in\sgrey.)",
            r"\g<1>{}\g<2>".format(script_options.outliers_of),
            content,
        )
        if n != 1:
            raise ProgramError("{}: invalid TeX summary file".format(tex_fn))

        content, n = re.subn(
            r"(The\soutliers\sof\sthe\s)\w+(\spopulation\sare\sshown\sin\s"
            r"grey,\swhile\ssamples\sof\sthe\ssource\sdataset\sthat\s"
            r"resemble\sthe\s)\w+(\spopulation\sare\sshown\sin\sorange.\sA\s"
            r"multiplier\sof\s)[0-9.]+(\swas\sused\sto\sfind\sthe\s)[0-9,]+(\s"
            r"outliers?.)",
            r"\g<1>{pop}\g<2>{pop}\g<3>{mult}\g<4>{nb:,d}\g<5>".format(
                pop=script_options.outliers_of,
                mult=script_options.multiplier,
                nb=nb_outliers,
            ),
            content,
        )
        if n != 1:
            raise ProgramError("{}: invalid TeX summary file".format(tex_fn))

        # Changing the figure (path)
        content, n = re.subn(
            r"(\s\\includegraphics\[.+\]\{)\{ethnicity.outliers\}.png(\}\s)",
            r"\g<1>{}\g<2>".format(
                "{" + script_options.out + ".outliers}." +
                script_options.format
            ),
            content,
        )
        if n != 1:
            raise ProgramError("{}: invalid TeX summary file".format(tex_fn))

    # Saving the new content
    with open(tex_fn, "w") as o_file:
        o_file.write(content)


[docs]def find_outliers(mds, centers, center_info, ref_pop, options):
    """Finds the outliers for a given population.

    :param mds: the ``mds`` information about each samples.
    :param centers: the centers of the three reference population clusters.
    :param center_info: the label of the three reference population clusters.
    :param ref_pop: the reference population for which we need the outliers
                    from.
    :param options: the options

    :type mds: numpy.recarray
    :type centers: numpy.array
    :type center_info: dict
    :type ref_pop: str
    :type options: argparse.Namespace

    :returns: a :py:class:`set` of outliers from the ``ref_pop`` population.

    Perform a ``KMeans`` classification using the three centers from the three
    reference population cluster.

    Samples are outliers of the required reference population (``ref_pop``) if:

    * the sample is part of another reference population cluster;
    * the sample is an outlier of the desired reference population
      (``ref_pop``).

    A sample is an outlier of a given cluster :math:`C_j` if the distance
    between this sample and the center of the cluster :math:`C_j` (:math:`O_j`)
    is bigger than a constant times the cluster's standard deviation
    :math:`\\sigma_j`.

    .. math::
        \\sigma_j = \\sqrt{\\frac{\\sum{d(s_i,O_j)^2}}{||C_j|| - 1}}

    where :math:`||C_j||` is the number of samples in the cluster :math:`C_j`,
    and :math:`d(s_i,O_j)` is the distance between the sample :math:`s_i` and
    the center :math:`O_j` of the cluster :math:`C_j`.

    .. math::
        d(s_i, O_j) = \\sqrt{(x_{O_j} - x_{s_i})^2 + (y_{O_j} - y_{s_i})^2}

    Using a constant equals to one ensure we remove 100% of the outliers from
    the cluster. Using a constant of 1.6 or 1.9 ensures we remove 99% and 95%
    of outliers, respectively (an error rate of 1% and 5%, respectively).

    """
    # Importing matplotlib for plotting purposes
    import matplotlib as mpl
    if options.format != "X11" and mpl.get_backend() != "agg":
        mpl.use("Agg")
    import matplotlib.pyplot as plt
    if options.format != "X11":
        plt.ioff()
    import matplotlib.mlab as mlab

    from sklearn.cluster import KMeans
    from sklearn.metrics.pairwise import euclidean_distances

    # Formatting the data
    data = np.array(zip(mds["c1"], mds["c2"]))

    # Configuring and running the KMeans
    k_means = KMeans(init=centers, n_clusters=3, n_init=1)
    k_means.fit_predict(data)

    # Creating the figure and axes
    fig_before, axe_before = plt.subplots(1, 1)
    fig_after, axe_after = plt.subplots(1, 1)
    fig_outliers, axe_outliers = plt.subplots(1, 1)

    # Setting the axe
    axe_before.xaxis.set_ticks_position("bottom")
    axe_before.yaxis.set_ticks_position("left")
    axe_before.spines["top"].set_visible(False)
    axe_before.spines["right"].set_visible(False)
    axe_before.spines["bottom"].set_position(("outward", 9))
    axe_before.spines["left"].set_position(("outward", 9))
    axe_after.xaxis.set_ticks_position("bottom")
    axe_after.yaxis.set_ticks_position("left")
    axe_after.spines["top"].set_visible(False)
    axe_after.spines["right"].set_visible(False)
    axe_after.spines["bottom"].set_position(("outward", 9))
    axe_after.spines["left"].set_position(("outward", 9))
    axe_outliers.xaxis.set_ticks_position("bottom")
    axe_outliers.yaxis.set_ticks_position("left")
    axe_outliers.spines["top"].set_visible(False)
    axe_outliers.spines["right"].set_visible(False)
    axe_outliers.spines["bottom"].set_position(("outward", 9))
    axe_outliers.spines["left"].set_position(("outward", 9))

    # Setting the title and labels
    axe_before.set_title("Before finding outliers", weight="bold")
    axe_before.set_xlabel(options.xaxis)
    axe_before.set_ylabel(options.yaxis)
    axe_after.set_title("After finding outliers\n($> {} "
                        "\\sigma$)".format(options.multiplier), weight="bold")
    axe_after.set_xlabel(options.xaxis)
    axe_after.set_ylabel(options.yaxis)
    axe_outliers.set_title("Outliers", weight="bold")
    axe_outliers.set_xlabel(options.xaxis)
    axe_outliers.set_ylabel(options.yaxis)

    # The population name
    ref_pop_name = ["CEU", "YRI", "JPT-CHB"]

    # The colors
    colors = ["#CC0000", "#669900", "#0099CC"]
    outlier_colors = ["#FFCACA", "#E2F0B6", "#C5EAF8"]

    # Plotting each of the clusters with the initial center
    plots_before = []
    plots_after = []
    plots_outliers = []
    plot_outliers = None
    plot_not_outliers = None
    outliers_set = set()
    for label in xrange(3):
        # Subsetting the data
        subset_mds = mds[k_means.labels_ == label]
        subset_data = data[k_means.labels_ == label]

        # Plotting the cluster
        p, = axe_before.plot(subset_mds["c1"], subset_mds["c2"], "o",
                             mec=colors[label], mfc=colors[label], ms=2,
                             clip_on=False)
        plots_before.append(p)

        # Plotting the cluster center (the real one)
        axe_before.plot(centers[label][0], centers[label][1], "o",
                        mec="#000000", mfc="#FFBB33", ms=6, clip_on=False)

        # Computing the distances
        distances = euclidean_distances(subset_data, centers[label])

        # Finding the outliers (that are not in the reference populations
        sigma = np.sqrt(np.true_divide(np.sum(distances ** 2),
                                       len(distances) - 1))
        outliers = np.logical_and(
            (distances > options.multiplier * sigma).flatten(),
            subset_mds["pop"] != ref_pop_name[label],
        )
        logger.info("  - {} outliers for the {} cluster".format(
            np.sum(outliers),
            ref_pop_name[label],
        ))

        # Saving the outliers
        if ref_pop_name[label] != options.outliers_of:
            # This is not the population we want, hence everybody is an outlier
            # (we don't include the reference population).
            outlier_mds = subset_mds[subset_mds["pop"] != ref_pop_name[label]]
            outliers_set |= set([(i["fid"], i["iid"]) for i in outlier_mds])

            # Plotting all samples that are not part of the reference
            # populations
            axe_outliers.plot(
                subset_mds["c1"][subset_mds["pop"] != ref_pop_name[label]],
                subset_mds["c2"][subset_mds["pop"] != ref_pop_name[label]],
                "o",
                mec="#555555",
                mfc="#555555",
                ms=2,
                clip_on=False,
            )
        else:
            # This is the population we want, hence only the real outliers are
            # outliers (we don't include the reference population)
            outlier_mds = subset_mds[
                np.logical_and(subset_mds["pop"] != ref_pop_name[label],
                               outliers)
            ]

            # Plotting the outliers
            plot_outliers, = axe_outliers.plot(outlier_mds["c1"],
                                               outlier_mds["c2"], "o",
                                               mec="#555555", mfc="#555555",
                                               ms=2, clip_on=False)

            # Plotting the not outliers
            plot_not_outliers, = axe_outliers.plot(
                subset_mds["c1"][np.logical_and(
                        ~outliers,
                        subset_mds["pop"] != ref_pop_name[label]
                )],
                subset_mds["c2"][np.logical_and(
                    ~outliers,
                    subset_mds["pop"] != ref_pop_name[label]
                )],
                "o",
                mec="#FFBB33",
                mfc="#FFBB33",
                ms=2,
                clip_on=False,
            )
            outliers_set |= set([(i["fid"], i["iid"]) for i in outlier_mds])

        # Plotting the cluster (without outliers)
        p, = axe_after.plot(
            subset_mds[~outliers]["c1"],
            subset_mds[~outliers]["c2"],
            "o",
            mec=colors[label],
            mfc=colors[label],
            ms=2,
            clip_on=False,
        )
        plots_after.append(p)

        # Plotting the cluster (only outliers)
        axe_after.plot(subset_mds[outliers]["c1"], subset_mds[outliers]["c2"],
                       "o", mec=outlier_colors[label],
                       mfc=outlier_colors[label], ms=2, clip_on=False)

        # Plotting only the reference populations
        p, = axe_outliers.plot(
            subset_mds["c1"][subset_mds["pop"] == ref_pop_name[label]],
            subset_mds["c2"][subset_mds["pop"] == ref_pop_name[label]],
            "o",
            mec=colors[label],
            mfc=colors[label],
            ms=2,
            clip_on=False,
        )
        plots_outliers.append(p)

        # Plotting the cluster center (the real one)
        axe_after.plot(centers[label][0], centers[label][1], "o",
                       mec="#000000", mfc="#FFBB33", ms=6)

    # The legends
    axe_before.legend(plots_before, ref_pop_name, loc="best", numpoints=1,
                      fancybox=True, fontsize=8).get_frame().set_alpha(0.5)
    axe_after.legend(plots_after, ref_pop_name, loc="best", numpoints=1,
                     fancybox=True, fontsize=8).get_frame().set_alpha(0.5)
    axe_outliers.legend(plots_outliers + [plot_not_outliers, plot_outliers],
                        ref_pop_name + ["SOURCE", "OUTLIERS"], loc="best",
                        numpoints=1, fancybox=True,
                        fontsize=8).get_frame().set_alpha(0.5)

    # Saving the figure
    fig_before.savefig("{}.before.{}".format(options.out, options.format),
                       dpi=300)
    fig_after.savefig("{}.after.{}".format(options.out, options.format),
                      dpi=300)
    fig_outliers.savefig("{}.outliers.{}".format(options.out, options.format),
                         dpi=300)

    return outliers_set


[docs]def find_ref_centers(mds):
    """Finds the center of the three reference clusters.

    :param mds: the ``mds`` information about each samples.

    :type mds: numpy.recarray

    :returns: a tuple with a :py:class:`numpy.array` containing the centers of
              the three reference population cluster as first element, and a
              :py:class:`dict` containing the label of each of the three
              reference population clusters.

    First, we extract the ``mds`` values of each of the three reference
    populations. The, we compute the center of each of those clusters by
    computing the means.

    .. math::
        \\textrm{Cluster}_\\textrm{pop} = \\left(
            \\frac{\\sum_{i=1}^n x_i}{n}, \\frac{\\sum_{i=1}^n y_i}{n}
        \\right)

    """
    # Computing the centers of each of the reference clusters
    ceu_mds = mds[mds["pop"] == "CEU"]
    yri_mds = mds[mds["pop"] == "YRI"]
    asn_mds = mds[mds["pop"] == "JPT-CHB"]

    # Computing the centers
    centers = [[np.mean(ceu_mds["c1"]), np.mean(ceu_mds["c2"])],
               [np.mean(yri_mds["c1"]), np.mean(yri_mds["c2"])],
               [np.mean(asn_mds["c1"]), np.mean(asn_mds["c2"])]]

    return np.array(centers), {"CEU": 0, "YRI": 1, "JPT-CHB": 2}


[docs]def read_mds_file(file_name, c1, c2, pops):
    """Reads a MDS file.

    :param file_name: the name of the ``mds`` file.
    :param c1: the first component to read (x axis).
    :param c2: the second component to read (y axis).
    :param pops: the population of each sample.

    :type file_name: str
    :type c1: str
    :type c2: str
    :type pops: dict

    :returns: a :py:class:`numpy.recarray` (one sample per line) with the
              information about the family ID, the individual ID, the first
              component to extract, the second component to extract and the
              population.

    The ``mds`` file is the result of Plink (as produced by the
    :py:mod:`pyGenClean.Ethnicity.check_ethnicity` module).

    """
    mds = []
    max_fid = 0
    max_iid = 0
    with open(file_name, 'rb') as input_file:
        # Getting and checking the header
        header_index = dict([
            (col_name, i) for i, col_name in
            enumerate(create_row(input_file.readline()))
        ])
        for col_name in {"FID", "IID", c1, c2}:
            if col_name not in header_index:
                msg = "{}: no column named {}".format(file_name, col_name)
                raise ProgramError(msg)

        for row in map(create_row, input_file):
            # Getting the sample ID
            sample_id = (row[header_index["FID"]], row[header_index["IID"]])

            # Checking we have a population for this sample
            if sample_id not in pops:
                msg = "{} {}: not in population file".format(sample_id[0],
                                                             sample_id[1])
                raise ProgramError(msg)

            # Saving the data
            mds.append((sample_id[0], sample_id[1], row[header_index[c1]],
                        row[header_index[c2]], pops[sample_id]))

            # Cheking to find the max FID
            if len(sample_id[0]) > max_fid:
                max_fid = len(sample_id[0])
            if len(sample_id[1]) > max_iid:
                max_iid = len(sample_id[1])

    # Creating the numpy array
    mds = np.array(mds, dtype=[("fid", "a{}".format(max_fid)),
                               ("iid", "a{}".format(max_iid)),
                               ("c1", float), ("c2", float),
                               ("pop", "a7")])

    return mds


[docs]def read_population_file(file_name):
    """Reads the population file.

    :param file_name: the name of the population file.

    :type file_name: str

    :returns: a :py:class:`dict` containing the population for each of the
              samples.

    The population file should contain three columns:

    1. The family ID.
    2. The individual ID.
    3. The population of the file (one of ``CEU``, ``YRI``, ``JPT-CHB`` or
       ``SOURCE``).

    The outliers are from the ``SOURCE`` population, when compared to one of
    the three reference population (``CEU``, ``YRI`` or ``JPT-CHB``).

    """
    pops = {}
    required_pops = {"CEU", "YRI", "JPT-CHB", "SOURCE"}
    with open(file_name, 'rb') as input_file:
        for line in input_file:
            row = line.rstrip("\r\n").split("\t")

            # The data
            sample_id = tuple(row[:2])
            pop = row[-1]

            # Checking the pop
            if pop not in required_pops:
                msg = ("{}: sample {}: unknown population "
                       "{}".format(file_name, " ".join(sample_id), pop))
                raise ProgramError(msg)

            # Saving the population file
            pops[tuple(row[:2])] = row[-1]

    return pops


[docs]def checkArgs(args):
    """Checks the arguments and options.

    :param args: a :py:class:`argparse.Namespace` object containing the options
                 of the program.

    :type args: argparse.Namespace

    :returns: ``True`` if everything was OK.

    If there is a problem with an option, an exception is raised using the
    :py:class:`ProgramError` class, a message is printed to the
    :class:`sys.stderr` and the program exists with code 1.

    """
    # Checking the input files
    if not os.path.isfile(args.mds):
        msg = "{}: no such file".format(args.mds)
        raise ProgramError(msg)

    if not os.path.isfile(args.population_file):
        msg = "{}: no such file".format(args.population_file)
        raise ProgramError(msg)

    # Checking the chosen components
    if args.xaxis == args.yaxis:
        msg = "xaxis must be different than yaxis"
        raise ProgramError(msg)

    return True


[docs]def parseArgs(argString=None):  # pragma: no cover
    """Parses the command line options and arguments.

    :param argString: the options.

    :type argString: list

    :returns: A :py:class:`argparse.Namespace` object created by the
              :py:mod:`argparse` module. It contains the values of the
              different options.

    ===================== ====== ==============================================
            Options        Type                   Description
    ===================== ====== ==============================================
    ``--mds``             string The MDS file from Plink.
    ``--population-file`` string A population file from
                                 :py:mod:`pyGenClean.Ethnicity.check_ethnicity`
                                 module.
    ``--format``          string The output file format (png, ps, or pdf.
    ``--outliers-of``     string Finds the outliers of this population.
    ``--multiplier``      float  To find the outliers, we look for more than
                                 :math:`x` times the cluster standard
                                 deviation.
    ``--xaxis``           string The component to use for the X axis.
    ``--yaxis``           string The component to use for the Y axis.
    ``--format``          string The output file format (png, ps, or pdf
                                 formats are available).
    ``--out``             string The prefix of the output files.
    ===================== ====== ==============================================

    .. note::
        No option check is done here (except for the one automatically done by
        argparse). Those need to be done elsewhere (see :py:func:`checkArgs`).

    """
    args = None
    if argString is None:
        args = parser.parse_args()
    else:
        args = parser.parse_args(argString)

    return args


[docs]def add_custom_options(parser):
    """Adds custom options to a parser.

    :param parser: the parser to which to add options.

    :type parser: argparse.ArgumentParser

    """
    parser.add_argument("--outliers-of", type=str, metavar="POP",
                        default="CEU", choices=["CEU", "YRI", "JPT-CHB"],
                        help=("Finds the outliers of this population. "
                              "[default: %(default)s]"))
    parser.add_argument("--multiplier", type=float, metavar="FLOAT",
                        default=1.9,
                        help=("To find the outliers, we look for more than "
                              "x times the cluster standard deviation. "
                              "[default: %(default).1f]"))
    parser.add_argument("--xaxis", type=str, metavar="COMPONENT", default="C1",
                        choices=["C{}".format(i) for i in xrange(1, 11)],
                        help=("The component to use for the X axis. "
                              "[default: %(default)s]"))
    parser.add_argument("--yaxis", type=str, metavar="COMPONENT", default="C2",
                        choices=["C{}".format(i) for i in xrange(1, 11)],
                        help=("The component to use for the Y axis. "
                              "[default: %(default)s]"))


[docs]class ProgramError(Exception):
    """An :py:class:`Exception` raised in case of a problem.

    :param msg: the message to print to the user before exiting.

    :type msg: str

    """
    def __init__(self, msg):
        """Construction of the :py:class:`ProgramError` class.

        :param msg: the message to print to the user

        :type msg: str

        """
        self.message = str(msg)

    def __str__(self):
        return self.message

# The parser object
desc = "Finds outliers in SOURCE from CEU samples."
long_desc = None
parser = argparse.ArgumentParser(description=desc)
parser.add_argument("-v", "--version", action="version",
                    version="pyGenClean version {}".format(__version__))


# The INPUT files
group = parser.add_argument_group("Input File")
group.add_argument("--mds", type=str, metavar="FILE", required=True,
                   help=("The MDS file from Plink"))
group.add_argument("--population-file", type=str, metavar="FILE",
                   required=True,
                   help=("A population file containing the following columns "
                         "(without a header): FID, IID and POP. POP should be "
                         "one of 'CEU', 'JPT-CHB', 'YRI' and SOURCE."))
# The options
group = parser.add_argument_group("Options")
add_custom_options(group)
group.add_argument("--format", type=str, metavar="FORMAT", default="png",
                   choices=["png", "ps", "pdf"],
                   help=("The output file format (png, ps, or pdf "
                         "formats are available). [default: %(default)s]"))
group.add_argument("--overwrite-tex", action="store_true",
                   help=("Using this option will overwrite any file that "
                         "match '*.summary.tex' (if any). This file is "
                         "automatically generated by the pyGenClean main "
                         "pipeline."))
# The OUTPUT files
group = parser.add_argument_group("Output File")
group.add_argument("--out", type=str, metavar="FILE",
                   default="ethnicity",
                   help=("The prefix of the output files. [default: "
                         "%(default)s]"))


[docs]def safe_main():
    """A safe version of the main function (that catches ProgramError)."""
    try:
        main()
    except KeyboardInterrupt:
        logger.info("Cancelled by user")
        sys.exit(0)
    except ProgramError as e:
        logger.error(e.message)
        parser.error(e.message)


if __name__ == "__main__":
    safe_main()