-
Notifications
You must be signed in to change notification settings - Fork 66
/
HisMaker.hh
252 lines (231 loc) · 10.1 KB
/
HisMaker.hh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
#ifndef __HISMAKER_HH__
#define __HISMAKER_HH__
// C/C++ includes
#include <iostream>
#include <fstream>
#include <sstream>
#include <string>
#include <map>
#include <vector>
using namespace std;
// ROOT includes
#include <TFrame.h>
#include <TKey.h>
#include <TApplication.h>
#include <TROOT.h>
#include <TTimer.h>
#include <Getline.h>
#include <TFile.h>
#include <TTree.h>
#include <TH1D.h>
#include <TH2D.h>
#include <TH3D.h>
#include <TMath.h>
#include <TF1.h>
#include <TLine.h>
#include <TCanvas.h>
#include <TStyle.h>
#include <Math/DistFunc.h>
#include <TPRegexp.h>
#include <THashTable.h>
#include <TGraph.h>
// Application includes
#include "AliParser.hh"
#include "Genome.hh"
#include "IO.hh"
// Constants
const static double PRECISION = 0.01;
const static double GENOME_SIZE = 2.9e+9; // Excluding gaps
const static double GENOME_CNV_FRACTION = 0.01;
const static double GENOME_SIZE_NORMAL = GENOME_SIZE*(1 - GENOME_CNV_FRACTION);
const static double GENOME_SIZE_CNV = GENOME_SIZE*GENOME_CNV_FRACTION;
const static double CUTOFF_REGION = 0.05;
const static double CUTOFF_TWO_REGIONS = 0.01;
class HisMaker
{
private:
static const int N_SQRT = 100000,N_FUNC = 100000,N_INV = 10000;
private:
int bin_size; // Bin size
double inv_bin_size; // Invert bin_size
TString root_file_name; // Name of root file with data
TString dir_name; // Directory in the root file
TH1 *rd_level,*rd_level_merge,*frag_len,*dl,*dl2; // Stats on partitioning
int chromosome_len;
bool useMappability;
double *sqrt_vals,*inv_vals;
TF1 **tfuncs;
TH1 *gen_his_signal,*gen_his_distr,*gen_his_distr_all; // His for genotyping
double _mean, _sigma; // Mean and sigma of gen_his_distr
double _mean_all,_sigma_all; // Mean and sigma of gen_his_distr_all
TCanvas *canv_view; // Canvas for displaying
Genome *refGenome_;
string dir_;
string fastafile;
IO io;
public:
HisMaker(string rootFile,Genome *genome = NULL);
HisMaker(string rootFile,int binSize,bool useGCcorrected,
Genome *genome = NULL);
~HisMaker();
// Input/output
private:
void writeTreeForChromosome(string chrom,short *arr_p,short *arr_u,int len);
bool readTreeForChromosome(TString fileName,
string chrom,short *arr_p,short *arr_u);
void writeATTreeForChromosome(string chrom,int *arr,int n);
bool writeHistograms(TH1 *his1 = NULL,TH1 *his2 = NULL,
TH1 *his3 = NULL,TH1 *his4 = NULL,
TH1 *his5 = NULL,TH1 *his6 = NULL)
{
return writeH(false,his1,his2,his3,his4,his5,his5);
}
bool writeHistogramsToBinDir(TH1 *his1 = NULL,TH1 *his2 = NULL,
TH1 *his3 = NULL,TH1 *his4 = NULL,
TH1 *his5 = NULL,TH1 *his6 = NULL)
{
return writeH(true,his1,his2,his3,his4,his5,his6);
}
bool writeH(bool useDir,
TH1 *his1,TH1 *his2,TH1 *his3,
TH1 *his4,TH1 *his5,TH1 *his6);
void writeTreeForVcf(string chr,std::vector<int> &vcf_position,std::vector<char> &vcf_ref,
std::vector<char> &vcf_alt, std::vector<short> &vcf_qual,
std::vector<short> &vcf_nref, std::vector<short> &vcf_nalt,
std::vector<short> &vcf_gt, std::vector<short> &vcf_flag);
void updateTreeIdVar(string chr,std::vector<int> &vcf_position, std::map<int,char> &vcf_refm, std::map<int,char> &vcf_altm);
public:
TH1* getHistogram(TString name,TString alt_name = "");
TH1* getHistogram(TString root_file,TString dir,
TString name,TString alt_name = "");
// Histogram naming
private:
TString rd_gc_name,rd_gc_xy_name;
TString rd_gc_GC_name,rd_gc_xy_GC_name;
public:
inline void setDataDir(string dir) { dir_ = dir; }
inline void setFastaFile(string ff) { fastafile = ff; }
TString getDirName(int bin);
TString getDistrName(string chr,int bin,bool useATcoor,bool useGCcorr);
TString getUDistrName(string chr,int bin);
TString getRawSignalName(TString chr,int bin);
TString getSignalName(TString chr,int bin,bool useATcoor,bool useGCcorr);
TString getUSignalName(TString chrom,int bin);
TString getAIBName(TString chr,int bin);
TString getPartitionName(TString chr,int bin,bool useATcorr,bool useGCcorr);
TString getGCName(TString chrom,int bin);
TString getATaggrName() { return "his_at_aggr"; }
int readChromosome(string chrom,char *seq,int max_len);
int parseGCandAT(char *seq,int len,int **address,TH1 *his = NULL);
// Making trees, histograms, parititoning, PE support
public:
void produceTrees(string *user_chroms,int n_chroms,
string *user_files,int n_files,
bool lite);
void produceTreesFrom1BAM(string *user_chroms,int n_chroms,
string user_files,bool lite);
void addVcf(string *user_chroms,int n_chroms,
string *user_files,int n_files,bool rmchr,bool addchr);
void IdVar(string *user_chroms,int n_chroms,
string *user_files,int n_files,bool rmchr,bool addchr);
void MaskVar(string *user_chroms,int n_chroms,
string *user_files,int n_files,bool rmchr,bool addchr);
double beta_pdf(double x, int a, int b);
void produceBAF(string *chrom,int n_chroms,bool useGCcorr,
bool useHaplotype,bool useid,bool usemask,int res=201);
void producePartitionBAF(string *chrom,int n_chroms,bool useGCcorr,
bool useHaplotype,bool useid,bool usemask,int res=201);
void callBAF(string *chrom,int n_chroms,bool useGCcorr,
bool useHaplotype,bool useid,bool usemask);
void mergeTrees(string *user_chroms,int n_chroms,
string *user_files,int n_files);
void produceHistograms(string *chrom,int n_chroms,
string *root_files,int n_root_files,
bool useGCcorr = false);
void produceHistogramsFa(string *chrom,int n_chroms,
string *root_files,int n_root_files,
bool useGCcorr = false);
void produceHistograms_try_correct(string *user_chroms,int n_chroms);
void produceHistogramsNew(string *user_chroms,int n_chroms);
void aggregate(string *files,int n_files,string *chrom,int n_chroms);
TTree *fitValley2ATbias(TH1 *his_read,TH1 *his_frg);
void stat(string *user_chroms,int n_chroms,bool useATcorr);
void eval(string *files,int n_files,bool useATcorr,bool useGCcorr);
void partition(string *user_chroms,int n_chroms,
bool skipMasked,bool useATcorr,bool useGCcorr,
bool exome = false,int range = 128);
void partition2D(string *user_chroms,int n_chroms,
bool skipMasked,bool useATcorr,bool useGCcorr,
bool exome = false,int range = 128);
void partitionSignal(int bin, string signal, unsigned int flags, string *user_chroms,int n_chroms,bool skipMasked,bool exome,int range);
void partitionSignal2D(int bin, string signal1, string signal2, unsigned int flags, string *user_chroms,int n_chroms,int range);
void callSVs(string *user_chroms,int n_chroms,bool useATcorr,bool useGCcorr,
double delta);
void callSVsSignal(int bin, string signal, unsigned int flags,string *user_chroms,int n_chroms,double delta);
void pe(string *bamss,int n_files,double over,double qual);
void pe_for_file(string file,
string *bams,int n_files,double over,double qual);
private:
double estimateDepthMaximum(string *user_chroms,int n_chroms);
int extract_pe(TString input,
string *bams,int n_bams,double over,double qual,
bool do_print = true,int *ses = NULL);
private:
bool correctGC(TH1 *his,TH1 *his_gc,TH2* his_rd_gc,TH1 *his_mean);
bool correctGCbyFragment(TH1 *his,TH1 *his_gc,TH2* his_rd_gc,TH1 *his_mean);
void calcGCcorrection(TH2* his_rd_gc,TH1 *his_mean,double *corr,int n);
void updateMask(double *rd,double *level,bool *mask,int n_bins,
double mean,double sigma);
void updateMask_skip(double *rd,double *level,bool *mask,int n_bins,
double mean,double sigma);
void calcLevels(double *level1,double *isig1,bool *mask,int n_bins,
int bin_band,bool skipMasked,
double *level2 = NULL,double *isig2 = NULL);
bool mergeLevels(double *level,int n_bins,double delta);
public: // Viewing and genotyping
void view(string *files,int n_files,bool useATcorr,bool useGCcorr);
void genotype(string *files,int n_files,bool useATcorr,bool useGCcorr);
private:
void executeROOT(TString obj_class,TString class_fun,TString args);
void printRegion(TString chrom,int start,int end,
bool useATcorr,bool useGCcorr);
void generateView(bool useATcorr,bool useGCcorr);
void generateView(TString chrom,int start,int end,
bool useATcorr,bool useGCcorr,
string *files = NULL,int win = -1);
void drawHistograms(TString chrom,int start,int end,int win,TString title,
TVirtualPad *pad,TH1 *raw,TH1 *his,TH1 *hisc,
TH1 *hisp,TH1* hism);
void generateViewBAF(TString chrom,int start,int end,
bool useATcorr,bool useGCcorr);
void drawHistogramsBAF(TString chrom,int start,int end,int win,TString title,
TVirtualPad *pad, TH1 *main, TTree *vcftreeidvar);
bool parseInput(TString &input,
TString &chrom,TString &start,TString &end,TString &option);
// Auxilary calculations
private:
bool isSexChrom(string chr);
double getInverse(int n);
double getSqrt(int n);
TF1 *getTFunction(int n);
void getAverageVariance(double *rd,int start,int stop,
double &average,double &variance,int &n);
double testRegion(double value,double m,double s,int n);
double testTwoRegions(double m1,double s1,int n1,double m2,double s2,int n2,
double scale);
double getEValue(double mean,double sigma,double *rd,int start,int end);
bool sameLevel(double l1, double l2);
bool getRegionLeft(double *level,int n_bins,int bin,int &start,int &stop);
bool getRegionRight(double *level,int n_bins,int bin,int &start,int &stop);
double getMean(TH1 *his);
bool adjustToEValue(double mean,double sigma,double *rd,int n_bins,
int &start,int &end,double eval);
int countGCpercentage(const char *seq,int low,int up);
double gaussianEValue(double mean,double sigma,double *rd,int start,int end);
int getChromNamesWithHis(string *names,bool useATcorr,bool useGCcorr);
int getChromNamesWithTree(string *names,string rfn = "",bool vcf = false);
int getChromLenWithTree(string name,string rfn = "");
public:
void getMeanSigma(TH1 *his,double &mean,double &sigma,bool dofit=true);
};
#endif