Source code for pyGenClean.SexCheck.gender_plot

#!/usr/bin/env python2.7

# This file is part of pyGenClean.
#
# pyGenClean is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation, either version 3 of the License, or (at your option) any later
# version.
#
# pyGenClean is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
# A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along with
# pyGenClean.  If not, see <http://www.gnu.org/licenses/>.


import os
import sys
import gzip
import logging
import argparse

import numpy as np

from .. import __version__


logger = logging.getLogger("gender_plot")


[docs]def main(argString=None): """The main function of the module. :param argString: the options. :type argString: list These are the steps: 1. Prints the options. 2. If there are ``summarized_intensities`` provided, reads the files (:py:func:`read_summarized_intensities`) and skips to step 7. 3. Reads the ``bim`` file to get markers on the sexual chromosomes (:py:func:`read_bim`). 4. Reads the ``fam`` file to get gender (:py:func:`read_fam`). 5. Reads the file containing samples with sex problems (:py:func:`read_sex_problems`). 6. Reads the intensities and summarizes them (:py:func:`read_intensities`). 7. Plots the summarized intensities (:py:func:`plot_gender`). """ # Getting and checking the options args = parseArgs(argString) checkArgs(args) data = None if args.summarized_intensities is None: # Reading the BIM file marker_names_chr = read_bim(args.bfile + ".bim") # Reading the FAM file sample_names_gender = read_fam(args.bfile + ".fam") # Reading the sex problem file sex_problems = read_sex_problems(args.sex_problems) # Reading the intensity file data = read_intensities(args.intensities, marker_names_chr, sample_names_gender, sex_problems) else: data = read_summarized_intensities(args.summarized_intensities) # Plot the gender plot_gender(data, args)
[docs]def read_sex_problems(file_name): """Reads the sex problem file. :param file_name: the name of the file containing sex problems. :type file_name: str :returns: a :py:class:`frozenset` containing samples with sex problem. If there is no ``file_name`` (*i.e.* is ``None``), then an empty :py:class:`frozenset` is returned. """ if file_name is None: return frozenset() problems = None with open(file_name, 'r') as input_file: header_index = dict([ (col_name, i) for i, col_name in enumerate(input_file.readline().rstrip("\r\n").split("\t")) ]) if "IID" not in header_index: msg = "{}: no column named IID".format(file_name) raise ProgramError(msg) problems = frozenset([ i.rstrip("\r\n").split("\t")[header_index["IID"]] for i in input_file.readlines() ]) return problems
[docs]def encode_chr(chromosome): """Encodes chromosomes. :param chromosome: the chromosome to encode. :type chromosome: str :returns: the encoded chromosome as :py:class:`int`. It changes ``X``, ``Y``, ``XY`` and ``MT`` to ``23``, ``24``, ``25`` and ``26``, respectively. It changes everything else as :py:class:`int`. If :py:class:`ValueError` is raised, then :py:class:`ProgramError` is also raised. If a chromosome as a value below 1 or above 26, a :py:class:`ProgramError` is raised. .. testsetup:: from pyGenClean.SexCheck.gender_plot import encode_chr .. doctest:: >>> [encode_chr(str(i)) for i in range(0, 11)] [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10] >>> [encode_chr(str(i)) for i in range(11, 21)] [11, 12, 13, 14, 15, 16, 17, 18, 19, 20] >>> [encode_chr(str(i)) for i in range(21, 27)] [21, 22, 23, 24, 25, 26] >>> [encode_chr(i) for i in ["X", "Y", "XY", "MT"]] [23, 24, 25, 26] >>> encode_chr("27") Traceback (most recent call last): ... ProgramError: 27: invalid chromosome >>> encode_chr("XX") Traceback (most recent call last): ... ProgramError: XX: invalid chromosome """ if chromosome == "X": return 23 if chromosome == "Y": return 24 if chromosome == "XY": return 25 if chromosome == "MT": return 26 try: new_chromosome = int(chromosome) if new_chromosome < 0 or new_chromosome > 26: msg = "{}: invalid chromosome".format(chromosome) raise ProgramError(msg) return new_chromosome except ValueError: msg = "{}: invalid chromosome".format(chromosome) raise ProgramError(msg)
[docs]def encode_gender(gender): """Encodes the gender. :param gender: the gender to encode. :type gender: str :returns: the encoded gender. It changes ``1`` and ``2`` to ``Male`` and ``Female`` respectively. It encodes everything else to ``Unknown``. .. testsetup:: from pyGenClean.SexCheck.gender_plot import encode_gender .. doctest:: >>> encode_gender("1") 'Male' >>> encode_gender("2") 'Female' >>> encode_gender("0") 'Unknown' >>> encode_gender("This is not a gender code") 'Unknown' """ if gender == "1": return "Male" if gender == "2": return "Female" return "Unknown"
[docs]def read_bim(file_name): """Reads the BIM file to gather marker names. :param file_name: the name of the ``bim`` file. :type file_name: str :returns: a :py:class:`dict` containing the chromosomal location of each marker on the sexual chromosomes. It uses the :py:func:`encode_chr` to encode the chromosomes from ``X`` and ``Y`` to ``23`` and ``24``, respectively. """ marker_names_chr = None with open(file_name, 'r') as input_file: marker_names_chr = dict([ (i[1], encode_chr(i[0])) for i in [ j.rstrip("\r\n").split("\t") for j in input_file.readlines() ] if encode_chr(i[0]) in {23, 24} ]) return marker_names_chr
[docs]def read_fam(file_name): """Reads the FAM file to gather sample names. :param file_name: the ``fam`` file to read. :type file_name: str :returns: a :py:class:`dict` containing the gender of each samples. It uses the :py:func:`encode_gender` to encode the gender from ``1``and ``2`` to ``Male`` and ``Female``, respectively. """ sample_names_gender = None with open(file_name, 'r') as input_file: sample_names_gender = dict([ (i[1], encode_gender(i[4])) for i in [ j.rstrip("\r\n").split(" ") for j in input_file.readlines() ] ]) return sample_names_gender
[docs]def plot_gender(data, options): """Plots the gender. :param data: the data to plot. :param options: the options. :type data: numpy.recarray :type options: argparse.Namespace Plots the summarized intensities of the markers on the Y chromosomes in function of the markers on the X chromosomes, with problematic samples with different colors. Also uses :py:func:`print_data_to_file` to save the data, so that it is faster to rerun the analysis. """ if data is None: # there is a problem... msg = ("no data: specify either '--bfile' and '--intensities', or " "'--summarized-intensities'") raise ProgramError(msg) import matplotlib as mpl if options.format != "X11" and mpl.get_backend() != "agg": mpl.use("Agg") import matplotlib.pyplot as plt if options.format != "X11": plt.ioff() # The figure and axes fig = plt.figure() fig.subplots_adjust(top=0.84) ax = fig.add_subplot(111) # Changing the spines ax.xaxis.set_ticks_position("bottom") ax.yaxis.set_ticks_position("left") ax.spines["top"].set_visible(False) ax.spines["right"].set_visible(False) # Setting the title ax.set_xlabel(options.xlabel) ax.set_ylabel(options.ylabel) # For the legend plot_object = [] labels = [] # Plotting the OK males males = np.logical_and(data["gender"] == "Male", data["status"] == "OK") tmp, = ax.plot(data["chr23"][males], data["chr24"][males], "o", ms=5, mec="#0099CC", mfc="#0099CC") plot_object.append(tmp) labels.append("OK Males (n={})".format(sum(males))) if options.summarized_intensities is None: print_data_to_file(data[males], "{}.ok_males.txt".format(options.out)) # Plotting the OK females females = np.logical_and(data["gender"] == "Female", data["status"] == "OK") tmp, = ax.plot(data["chr23"][females], data["chr24"][females], "o", ms=5, mec="#CC0000", mfc="#CC0000") plot_object.append(tmp) labels.append("OK Females (n={})".format(sum(females))) if options.summarized_intensities is None: print_data_to_file(data[females], "{}.ok_females.txt".format(options.out)) # Plotting the OK unknowns unknowns = np.logical_and(data["gender"] == "Unknown", data["status"] == "OK") tmp, = ax.plot(data["chr23"][unknowns], data["chr24"][unknowns], "o", ms=5, mec="#555555", mfc="#555555") plot_object.append(tmp) labels.append("OK Unknowns (n={})".format(sum(unknowns))) if options.summarized_intensities is None: print_data_to_file(data[unknowns], "{}.ok_unknowns.txt".format(options.out)) # Plotting the Problem males males = np.logical_and(data["gender"] == "Male", data["status"] == "Problem") tmp, = ax.plot(data["chr23"][males], data["chr24"][males], "^", ms=6, mec="#000000", mfc="#669900") plot_object.append(tmp) labels.append("Problematic Males (n={})".format(sum(males))) if options.summarized_intensities is None: print_data_to_file(data[males], "{}.problematic_males.txt".format(options.out)) # Plotting the Problem females females = np.logical_and(data["gender"] == "Female", data["status"] == "Problem") tmp, = ax.plot(data["chr23"][females], data["chr24"][females], "v", ms=6, mec="#000000", mfc="#9933CC") plot_object.append(tmp) labels.append("Problematic Females (n={})".format(sum(females))) if options.summarized_intensities is None: print_data_to_file(data[females], "{}.problematic_females.txt".format(options.out)) # Plotting the Problem unknowns unknowns = np.logical_and(data["gender"] == "Unknown", data["status"] == "Problem") tmp, = ax.plot(data["chr23"][unknowns], data["chr24"][unknowns], ">", ms=6, mec="#000000", mfc="#555555") plot_object.append(tmp) labels.append("Problematic Unknown (n={})".format(sum(unknowns))) if options.summarized_intensities is None: print_data_to_file(data[unknowns], "{}.problematic_unknowns.txt".format(options.out)) # the legend prop = mpl.font_manager.FontProperties(size=10) leg = ax.legend(plot_object, labels, loc=8, numpoints=1, fancybox=True, prop=prop, ncol=2, bbox_to_anchor=(0., 1.02, 1., .102), borderaxespad=0.) # Setting the limit xlim = ax.get_xlim() ax.set_xlim((xlim[0]-0.01, xlim[1]+0.01)) ylim = ax.get_ylim() ax.set_ylim((ylim[0]-0.01, ylim[1]+0.01)) if options.format == "X11": plt.show() else: file_name = "{}.{}".format(options.out, options.format) try: plt.savefig(file_name) except IOError: msg = "{}: can't write file".format(file_name) raise ProgramError(msg)
[docs]def read_summarized_intensities(prefix): """Reads the summarized intensities from 6 files. :param prefix: the prefix of the six files. :type prefix: str :returns: a :py:class`numpy.recarray` containing the following columns (for each of the samples): ``sampleID``, ``chr23``, ``chr24``, ``gender`` and ``status``. Instead of reading a final report (like :py:func:`read_intensities`), this function reads six files previously created by this module to gather sample information. Here are the content of the six files: * ``prefix.ok_females.txt``: information about females without sex problem. * ``prefix.ok_males.txt``: information about males without sex problem. * ``prefix.ok_unknowns.txt``: information about unknown gender without sex problem. * ``prefix.problematic_females.txt``: information about females with sex problem. * ``prefix.problematic_males.txt``: information about males with sex problem. * ``prefix.problematic_unknowns.txt``: information about unknown gender with sex problem. Each file contains the following columns: ``sampleID``, ``chr23``, ``chr24``, ``gender`` and ``status``. The final data set contains the following information for each sample: * ``sampleID``: the sample ID. * ``chr23``: the summarized intensities for chromosome 23. * ``chr24``: the summarized intensities for chromosome 24. * ``gender``: the gender of the sample (``Male`` or ``Female``). * ``status``: the status of the sample (``OK`` or ``Problem``). The summarized intensities for a chromosome (:math:`S_{chr}`) is computed using this formula (where :math:`I_{chr}` is the set of all marker intensities on chromosome :math:`chr`): .. math:: S_{chr} = \\frac{\\sum{I_{chr}}}{||I_{chr}||} """ data = [] for suffix in {".ok_females.txt", ".ok_males.txt", ".ok_unknowns.txt", ".problematic_females.txt", ".problematic_males.txt", ".problematic_unknowns.txt"}: with open(prefix + suffix, 'r') as input_file: header_index = None for line_nb, line in enumerate(input_file): row = line.rstrip("\r\n").split("\t") if line_nb == 0: # This is the header header_index = dict([ (col_name, i) for i, col_name in enumerate(row) ]) for col_name in {"sampleID", "chr23", "chr24", "gender", "status"}: if col_name not in header_index: msg = "{}: no column named {}".format( prefix+suffix, col_name, ) raise ProgramError(msg) else: sampleID = row[header_index["sampleID"]] chr23 = row[header_index["chr23"]] chr24 = row[header_index["chr24"]] gender = row[header_index["gender"]] status = row[header_index["status"]] try: chr23 = float(chr23) chr24 = float(chr24) except ValueError: msg = ("{} and {}: bad summarized intensities for " "sample {}".format(chr23, chr24, sampleID)) raise ProgramError(msg) data.append((sampleID, chr23, chr24, gender, status)) # Creating the data structure data = np.array( data, dtype=[("sampleID", "a{}".format(max([len(i[0]) for i in data]))), ("chr23", float), ("chr24", float), ("gender", "a{}".format(max([len(i[3]) for i in data]))), ("status", "a{}".format(max([len(i[4]) for i in data])))], ) return data
[docs]def read_intensities(file_name, needed_markers_chr, needed_samples_gender, sex_problems): """Reads the intensities from a file. :param file_name: the name of the input file. :param needed_markers_chr: the markers that are needed. :param needed_samples_gender: the gender of all the samples. :param sex_problems: the sample with sex problem. :type file_name: str :type needed_markers_chr: dict :type needed_samples_gender: dict :type sex_problems: frozenset :returns: a :py:class`numpy.recarray` containing the following columns (for each of the samples): ``sampleID``, ``chr23``, ``chr24``, ``gender`` and ``status``. Reads the normalized intensities from a final report. The file must contain the following columns: ``SNP Name``, ``Sample ID``, ``X``, ``Y`` and ``Chr``. It then keeps only the required markers (those that are on sexual chromosomes (``23`` or ``24``), encoding `NaN` intensities to zero. The final data set contains the following information for each sample: * ``sampleID``: the sample ID. * ``chr23``: the summarized intensities for chromosome 23. * ``chr24``: the summarized intensities for chromosome 24. * ``gender``: the gender of the sample (``Male`` or ``Female``). * ``status``: the status of the sample (``OK`` or ``Problem``). The summarized intensities for a chromosome (:math:`S_{chr}`) is computed using this formula (where :math:`I_{chr}` is the set of all marker intensities on chromosome :math:`chr`): .. math:: S_{chr} = \\frac{\\sum{I_{chr}}}{||I_{chr}||} """ input_file = None if file_name.endswith(".gz"): input_file = gzip.open(file_name, 'rb') else: input_file = open(file_name, 'r') # The intensities intensities = {} header_index = None for line_nb, line in enumerate(input_file): row = line.rstrip("\r\n").split("\t") if line_nb == 0: # This is the header header_index = dict([ (col_name, i) for i, col_name in enumerate(row) ]) # Check the column names for col_name in {"SNP Name", "Sample ID", "X", "Y", "Chr"}: if col_name not in header_index: msg = "{}: no column named {}".format(file_name, col_name) raise ProgramError(msg) else: # This is the data # Check if we want this sample and this marker sampleID = row[header_index["Sample ID"]] markerID = row[header_index["SNP Name"]] chromosome = encode_chr(row[header_index["Chr"]]) if chromosome not in {23, 24}: # Not good chromsoome continue if sampleID not in needed_samples_gender: # Sample not needed continue if markerID not in needed_markers_chr: # Marker not needed continue if sampleID not in intensities: # First time we see this sample intensities[sampleID] = [0, 0, 0, 0] # We get the intensities allele_a = row[header_index["X"]] allele_b = row[header_index["Y"]] # Check for NaN if allele_a == "NaN" or allele_b == "NaN": continue try: allele_a = float(allele_a) allele_b = float(allele_b) except ValueError: msg = "{}: {} and {}: wrong intensities".format(file_name, allele_a, allele_b) raise ProgramError(msg) if chromosome == 23: # Chromosome 23 intensities[sampleID][0] += allele_a + allele_b intensities[sampleID][1] += 1 else: # Chromosome 24 intensities[sampleID][2] += allele_a + allele_b intensities[sampleID][3] += 1 # Closing the input file input_file.close() # Creating the data structure data = [] for sampleID in intensities.iterkeys(): sum_chr_23, nb_chr_23, sum_chr_24, nb_chr_24 = intensities[sampleID] status = "OK" if sampleID in sex_problems: status = "Problem" try: data.append((sampleID, sum_chr_23 / nb_chr_23, sum_chr_24 / nb_chr_24, needed_samples_gender[sampleID], status)) except ZeroDivisionError: msg = "0 marker on chr23 or chr24" raise ProgramError(msg) data = np.array( data, dtype=[("sampleID", "a{}".format(max([len(i[0]) for i in data]))), ("chr23", float), ("chr24", float), ("gender", "a{}".format(max([len(i[3]) for i in data]))), ("status", "a{}".format(max([len(i[4]) for i in data])))], ) return data
[docs]def checkArgs(args): """Checks the arguments and options. :param args: an object containing the options of the program. :type args: argparse.Namespace :returns: ``True`` if everything was OK. If there is a problem with an option, an exception is raised using the :py:class:`ProgramError` class, a message is printed to the :class:`sys.stderr` and the program exists with code 1. """ # Checking if there are input files if args.summarized_intensities is None: if args.bfile is None and args.intensities is None: msg = ("need to specify either '--bfile' and '--intensities', or " "'--summarized-intensities'") raise ProgramError(msg) # Checking the fam file bim for suffix in {".bim", ".fam"}: if not os.path.isfile(args.bfile + suffix): msg = "{}: no such file".format(args.bfile + suffix) raise ProgramError(msg) # Checking the intensity file if not os.path.isfile(args.intensities): msg = "{}: no such file".format(args.intensities) raise ProgramError(msg) else: if args.bfile is not None or args.intensities is not None: msg = ("need to specify either '--bfile' and '--intensities', or " "'--summarized-intensities'") raise ProgramError(msg) for suffix in {".ok_females.txt", ".ok_males.txt", ".ok_unknowns.txt", ".problematic_females.txt", ".problematic_males.txt", ".problematic_unknowns.txt"}: if not os.path.isfile(args.summarized_intensities + suffix): msg = "{}: no such file".format(args.summarized_intensities + suffix) raise ProgramError(msg) # Checking the sex problem file if args.sex_problems is not None: if not os.path.isfile(args.sex_problems): msg = "{}: no such file".format(args.sex_problems) raise ProgramError(msg) return True
[docs]def parseArgs(argString=None): # pragma: no cover """Parses the command line options and arguments. :param argString: the options. :type argString: list :returns: A :py:class:`argparse.Namespace` object created by the :py:mod:`argparse` module. It contains the values of the different options. ============================ ====== ======================================= Options Type Description ============================ ====== ======================================= ``--bfile`` string The plink binary file containing information about markers and individuals. ``--intensities`` string A file containing alleles intensities for each of the markers located on the X and Y chromosome. ``--summarized-intensities`` string The prefix of six files containing summarized chr23 and chr24 intensities. ``--sex-problems`` string The file containing individuals with sex problems. ``--format`` string The output file format (png, ps, pdf, or X11). ``--xlabel`` string The label of the X axis. ``--ylabel`` string The label of the Y axis. ``--out`` string The prefix of the output files. ============================ ====== ======================================= .. note:: No option check is done here (except for the one automatically done by argparse). Those need to be done elsewhere (see :py:func:`checkArgs`). """ args = None if argString is None: args = parser.parse_args() else: args = parser.parse_args(argString) return args
[docs]class ProgramError(Exception): """An :py:class:`Exception` raised in case of a problem. :param msg: the message to print to the user before exiting. :type msg: str """ def __init__(self, msg): """Construction of the :py:class:`ProgramError` class. :param msg: the message to print to the user :type msg: str """ self.message = str(msg) def __str__(self): return self.message
# The parser object desc = "Plots the gender using X and Y chromosomes' intensities" parser = argparse.ArgumentParser(description=desc) parser.add_argument("-v", "--version", action="version", version="pyGenClean version {}".format(__version__)) # The INPUT files group = parser.add_argument_group("Input File") group.add_argument("--bfile", type=str, metavar="FILE", required=True, help=("The plink binary file containing information " "about markers and individuals. Must be specified " "if '--summarized-intensities' is not.")) group.add_argument("--intensities", type=str, metavar="FILE", help=("A file containing alleles intensities for each " "of the markers located on the X and Y " "chromosome. Must be specified if " "'--summarized-intensities' is not.")) group.add_argument("--summarized-intensities", type=str, metavar="FILE", help=("The prefix of six files (prefix.ok_females.txt, " "prefix.ok_males.txt, prefix.ok_unknowns.txt, " "problematic_females.txt, problematic_males.txt " "and problematic_unknowns.txt) containing " "summarized chr23 and chr24 intensities. Must be " "specified if '--bfile' and '--intensities' are " "not.")) group.add_argument("--sex-problems", type=str, metavar="FILE", required=True, help=("The file containing individuals with sex " "problems. This file is not read if the option " "'summarized-intensities' is used.")) # The options group = parser.add_argument_group("Options") group.add_argument("--format", type=str, metavar="FORMAT", default="png", choices=["png", "ps", "pdf", "X11"], help=("The output file format (png, ps, pdf, or X11 " "formats are available). [default: %(default)s]")) group.add_argument("--xlabel", type=str, metavar="STRING", default="X intensity", help="The label of the X axis. [default: %(default)s]") group.add_argument("--ylabel", type=str, metavar="STRING", default="Y intensity", help="The label of the Y axis. [default: %(default)s]") # The OUTPUT files group = parser.add_argument_group("Output File") group.add_argument("--out", type=str, metavar="FILE", default="sexcheck", help=("The prefix of the output files (which will be " "a Plink binary file. [default: %(default)s]"))
[docs]def safe_main(): """A safe version of the main function (that catches ProgramError).""" try: main() except KeyboardInterrupt: logger.info("Cancelled by user") sys.exit(0) except ProgramError as e: logger.error(e.message) parser.error(e.message)
if __name__ == "__main__": safe_main()