import pandas as pd
import numpy as np
import jass_preprocessing.dna_utils as dna_u
import warnings
import scipy.stats as ss
import seaborn as sns
import matplotlib.pyplot as plt
[docs]
def compute_z_score(mgwas):
"""
Compute zscore value and sign1
add the corresponding column to the mgwas dataframe
the smallest positive value of float
sys.float_info.min
2.2250738585072014e-308
the biggest Z score
np.sqrt(ss.chi2.isf(sys.float_info.min, 1))
37.537836095576054
"""
print(mgwas.columns)
mgwas["computed_z"] = np.sqrt(ss.chi2.isf(mgwas['pval'], 1))
if 'beta_or_Z' in mgwas.columns:
sign_vect = np.sign(mgwas.beta_or_Z)
mgwas["computed_z"] = mgwas["computed_z"].replace(np.inf, 37.537836095576054)
else:
if "OR" in mgwas.columns:
sign_vect = np.sign(mgwas["OR"] - 1.0 + 10**(-8))
mgwas["computed_z"] = mgwas["computed_z"].replace(np.inf, 37.537836095576054)
else:
raise ValueError(
'The gwas data frame doesn"t contain effect column')
mgwas["computed_z"] = mgwas["computed_z"] * sign_vect * mgwas["sign_flip"]
return mgwas
[docs]
def compute_sample_size(mgwas, diagnostic_folder, trait, perSS = 0.7):
if 'n' in mgwas.columns:
myN = mgwas.n
#--- freq, case-cont N exist
elif(('Ncase' in mgwas.columns) & ('Ncontrol' in mgwas.columns)):
sumN = mgwas.ncas + mgwas.ncont
perCase = mgwas.ncas / sumN
myN = sumN * perCase * (1-perCase)
#---se, freq exist and QT
elif(('se' in mgwas.columns) & ('freq' in mgwas.columns)):
myN = 1/(mgwas.freq * (1-mgwas.freq)* 2 * mgwas.se**2)
#---se exists but no freq, // use ref panel freq
elif(('se' in mgwas.columns) & ('freq' not in mgwas.columns)):
myN = 1/(mgwas.MAF * (1-mgwas.MAF)* 2 * mgwas.se**2)
else:
raise ValueError(
'Both Sample size and SE(beta) are missing: sample size filtering cannot be applied')
# replace infinite value by the max finite one
if(np.isinf(myN).any()):
myN[np.isinf(myN)] = np.max(myN[np.isfinite(myN)])
warnings.warn("Some snp had an infinite sample size")
myW_thres = np.percentile(myN.dropna(), 90)
ss_thres = np.double(perSS) * myW_thres
mgwas["computed_N"] = myN
plt.clf()
p1 = sns.distplot(mgwas.computed_N[~mgwas.computed_N.isna()])
p1.axvline(x=ss_thres)
fo = "{0}/Sample_size_distribution_{1}.png".format(diagnostic_folder, trait)
p1.figure.savefig(fo)
# Filter SNP with a too small sample _SampleSize
print("NSNP before sample size filtering: {}".format(mgwas.shape[0]))
mgwas = mgwas.loc[(myN >= ss_thres)]
mgwas = mgwas.loc[~mgwas.computed_N.isna()]
print("NSNP after sample size filtering: {}".format(mgwas.shape[0]))
return(mgwas)