Configuration Files

Two default configuration files are available to run the proposed protocol. Before using them, be sure to follow the Preprocessing Steps described in the Proposed Protocol section.

Note that lines starting with a # are comments, and are not used by the pipeline. Parameters left at their default values are commented out; uncomment them to change their values.

First Configuration File

This file should be used with the original dataset as input. Only change the raw-dir directory in the contamination section ([1]), the loop-assoc file name in the plate_bias section ([9]) and the reference population files (ceu-bfile, yri-bfile and jpt-chb-bfile) in the check_ethnicity section ([11]). Those last three datasets are provided and can be downloaded at http://www.statgen.org.

If you want to generate the gender plot and the BAF and LRR plots, you will need to provide the intensities (sex-chr-intensities and lrr-baf-raw-dir in the sex_check section ([8])) after uncommenting the required options.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
# This is the first part of example configuration files for performing efficient
# data clean up. All commented out parameters are those that are used by
# default.


[1]
# ##############################################################################
# Checks sample contamination using the bafRegress tool
# (http://genome.sph.umich.edu/wiki/BAFRegress). Field names can be modified
# using options (as described below).
# ##############################################################################

script = contamination
# Placeholder path: replace it with the location of your intensity data.
raw-dir = /PATH/TO/DIRECTORY/CONTAINING/INTENSITIES.txt
# colsample = Sample Name
# colmarker = SNP Name
# colbaf = B Allele Freq
# colab1 = Allele1 - AB
# colab2 = Allele2 - AB
# sge
# sge-walltime = WRITE WALLTIME ONLY IF REQUIRED
# sge-nodes = WRITE NB NODES AND NB PROCESSOR PER NODE ONLY IF REQUIRED
# sample-per-run-for-sge = 30



[2]
# ##############################################################################
# Checks the missing rate and pairwise concordance of duplicated samples.
# Duplicated samples should have the same family and individual identification
# numbers. The names can be modified directly in the transposed pedfile.
# ##############################################################################

script = duplicated_samples
# sample-completion-threshold = 0.9
# sample-concordance-threshold = 0.97



[3]
# ##############################################################################
# Checks the missing rate and pairwise concordance of duplicated markers.
# Duplicated markers are found by looking at their chromosomal position. No
# modification of the transposed bedfile is required.
# ##############################################################################

script = duplicated_snps
# snp-completion-threshold = 0.9
# snp-concordance-threshold = 0.98
# NOTE(review): the option below is spelled with an underscore, unlike the
# hyphenated options above - confirm against the script's option list before
# uncommenting.
# frequency_difference = 0.05



[4]
# ##############################################################################
# Finds and removes markers which have a missing rate of 100%, or markers (not
# located on the mitochondrial chromosome) which have a heterozygosity rate of
# 0%.
# ##############################################################################

script = noCall_hetero_snps



[5]
# ##############################################################################
# Removes samples with a missing rate higher than a user-defined threshold. For
# this step, we recommend using a threshold of 10% missing rate, as samples with
# a missing rate higher than 2% will be removed later (step [7]).
# ##############################################################################

script = sample_missingness
# mind = 0.1



[6]
# ##############################################################################
# Removes markers with a missing rate higher than a user-defined threshold. For
# this step, we recommend using a threshold of 2% missing rate.
# ##############################################################################

script = snp_missingness
# geno = 0.02



[7]
# ##############################################################################
# Removes samples with a missing rate higher than a user-defined threshold. For
# this step, we recommend using a threshold of 2% missing rate.
# ##############################################################################

script = sample_missingness
# Set explicitly (not commented out): 0.02 replaces the default threshold of
# 0.1 that is shown commented out in step [5].
mind = 0.02



[8]
# ##############################################################################
# Using PLINK, finds samples with gender issues, according to the heterozygosity
# rate on the X chromosome. If you want to produce a gender plot, you need to
# uncomment the "gender-plot" option and provide a file containing marker
# intensities on the X and Y chromosomes. If you want to produce a BAF and LRR
# plot, you need to uncomment the "lrr-baf" option and provide a directory
# containing the BAF and LRR values of each marker on the X and Y chromosomes
# (one file per sample).
# ##############################################################################

script = sex_check
# femaleF = 0.3
# maleF = 0.7
# nbChr23 = 50
# Gender plot options (the intensity file is provided with
# "sex-chr-intensities"):
# gender-plot
# sex-chr-intensities = /PATH/TO/FILE/CONTAINING/INTENSITIES_FILE.txt
# gender-plot-format = png
# BAF and LRR plot options (the directory is provided with "lrr-baf-raw-dir"):
# lrr-baf
# lrr-baf-raw-dir = /PATH/TO/DIRECTORY/CONTAINING/BAF_LRR_FILES.txt
# lrr-baf-format = png
# lrr-baf-dpi = 300



[9]
# ##############################################################################
# Using PLINK, performs a plate bias analysis, using a p value threshold of
# 1.0e-7.
# ##############################################################################

script = plate_bias
# Placeholder path: replace it with the file containing your plate information.
loop-assoc = /PATH/TO/FILE/CONTAINING/PLATE_INFORMATION.txt
# pfilter = 1.0e-07



[10]
# ##############################################################################
# Checks for related individuals and randomly keeps one of each related group.
# If you have a server with a DRMAA-compliant distributed resource management
# system, you can uncomment the "sge" and the "line-per-file-for-sge" options,
# to run this step in parallel.
# ##############################################################################

script = find_related_samples
# min-nb-snp = 10000
# indep-pairwise = 50 5 0.1
# maf = 0.05
# ibs2-ratio = 0.8
# sge
# line-per-file-for-sge = 100
# sge-walltime = WRITE WALLTIME ONLY IF REQUIRED
# sge-nodes = WRITE NB NODES AND NB PROCESSOR PER NODE ONLY IF REQUIRED



[11]
# ##############################################################################
# Using PLINK, computes the MDS value of each sample, and using three reference
# populations (CEU, YRI and JPT-CHB), finds outliers of one of those three
# reference populations. You might want to skip the reference populations using
# the "skip-ref-pops" option. You might need to change the "multiplier" option
# to be more or less stringent, according to your dataset. If you have a server
# with a DRMAA-compliant distributed resource management system, you can
# uncomment the "sge" and the "line-per-file-for-sge" options, to run this step
# in parallel.
# ##############################################################################

script = check_ethnicity
# Placeholder paths: replace them with the PLINK binary files of the three
# reference populations.
ceu-bfile = /PATH/TO/PLINK/BINARY/FILE/FOR/CEU_population
yri-bfile = /PATH/TO/PLINK/BINARY/FILE/FOR/YRI_population
jpt-chb-bfile = /PATH/TO/PLINK/BINARY/FILE/FOR/JPT-CHB_population
# skip-ref-pops
# min-nb-snp = 8000
# indep-pairwise = 50 5 0.1
# maf = 0.05
# sge
# line-per-file-for-sge = 100
# nb-components = 10
# outliers-of = CEU
# multiplier = 1.9
# xaxis = C1
# yaxis = C2
# format = png
# title = "C2 in function of C1 - MDS"
# xlabel = C1
# ylabel = C2
# create-scree-plot
# scree-plot-title = "TITLE OF THE PLOT"
# sge-walltime = WRITE WALLTIME ONLY IF REQUIRED
# sge-nodes = WRITE NB NODES AND NB PROCESSOR PER NODE ONLY IF REQUIRED
# ibs-sge-walltime = WRITE WALLTIME ONLY IF REQUIRED
# ibs-sge-nodes = WRITE NB NODES AND NB PROCESSOR PER NODE ONLY IF REQUIRED

Second Configuration File

This configuration file should be run after the First Configuration File and with the output of the second sample missingness section ([7] in the First Configuration File).

A file containing the samples and markers to be removed should be created using the output of the sex_check, find_related_samples, check_ethnicity and plate_bias sections of the First Configuration File.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# This is the second part of example configuration files for performing
# efficient data clean up. All commented out parameters are those that are used
# by default.

# The input file should be the output file of the second sample missingness step
# (which is the one that has been used by any of these scripts):
#    - sex_check
#    - find_related_samples
#    - check_ethnicity
#    - plate_bias

# Note that the final usable dataset is the one located in the directory where
# "remove_heterozygous_haploid" was run (which is the one that has been used by
# any of these scripts):
#    - flag_maf_zero
#    - flag_hw
# Hence, if you want to remove the flagged markers, you should use
# pyGenClean_subset_data on the markers listed in the "flag_maf_zero" and
# "flag_hw" directories, using the PLINK binary file located in
# "remove_heterozygous_haploid".

[12]
# ##############################################################################
# After manually checking that everything went fine in the previous steps, you
# need to create a list of samples to remove and a list of markers to exclude
# from the results of steps [8] to [11] of the first configuration file
# (sex_check, plate_bias, find_related_samples and check_ethnicity). The list
# of samples is a file containing the family and individual identification
# numbers of all the samples to remove. Note that the two options
# 'reason-marker' and 'reason-sample' are for the automatic report generated
# after the analysis.
# ##############################################################################

script = subset
reason-marker = reason for marker exclusion
reason-sample = reason for sample exclusion
# Placeholder paths: replace them with the two lists described above.
remove = /PATH/TO/FILE/CONTAINING/ALL_SAMPLES_FROM_PREVIOUS_STEPS_TO_REMOVE.txt
exclude = /PATH/TO/FILE/CONTAINING/ALL_MARKERS_FROM_PREVIOUS_STEPS_TO_EXCLUDE.txt



[13]
# ##############################################################################
# Removes heterozygous haploid genotypes from the dataset. The final usable
# dataset is the one located in the directory where this step was run (the
# following steps only flag markers).
# ##############################################################################

script = remove_heterozygous_haploid



[14]
# ##############################################################################
# Flags uninformative markers (with a MAF of 0). This step only flags markers.
# You might want to exclude them later on.
# ##############################################################################

script = flag_maf_zero



[15]
# ##############################################################################
# Flags markers that fail the HWE test for a p value of 1e-4 and after
# Bonferroni correction. This step only flags markers. You might want to exclude
# them later on.
# ##############################################################################

script = flag_hw
# hwe = 1e-4