Source code for jass_preprocessing.compute_score

import pandas as pd
import numpy as np
import jass_preprocessing.dna_utils as dna_u
import warnings
import scipy.stats as ss
import seaborn as sns
import matplotlib.pyplot as plt



[docs] def compute_z_score(mgwas): """ Compute zscore value and sign1 add the corresponding column to the mgwas dataframe the smallest positive value of float sys.float_info.min 2.2250738585072014e-308 the biggest Z score np.sqrt(ss.chi2.isf(sys.float_info.min, 1)) 37.537836095576054 """ print(mgwas.columns) mgwas["computed_z"] = np.sqrt(ss.chi2.isf(mgwas['pval'], 1)) if 'beta_or_Z' in mgwas.columns: sign_vect = np.sign(mgwas.beta_or_Z) mgwas["computed_z"] = mgwas["computed_z"].replace(np.inf, 37.537836095576054) else: if "OR" in mgwas.columns: sign_vect = np.sign(mgwas["OR"] - 1.0 + 10**(-8)) mgwas["computed_z"] = mgwas["computed_z"].replace(np.inf, 37.537836095576054) else: raise ValueError( 'The gwas data frame doesn"t contain effect column') mgwas["computed_z"] = mgwas["computed_z"] * sign_vect * mgwas["sign_flip"] return mgwas
[docs] def compute_sample_size(mgwas, diagnostic_folder, trait, perSS = 0.7): if 'n' in mgwas.columns: myN = mgwas.n #--- freq, case-cont N exist elif(('Ncase' in mgwas.columns) & ('Ncontrol' in mgwas.columns)): sumN = mgwas.ncas + mgwas.ncont perCase = mgwas.ncas / sumN myN = sumN * perCase * (1-perCase) #---se, freq exist and QT elif(('se' in mgwas.columns) & ('freq' in mgwas.columns)): myN = 1/(mgwas.freq * (1-mgwas.freq)* 2 * mgwas.se**2) #---se exists but no freq, // use ref panel freq elif(('se' in mgwas.columns) & ('freq' not in mgwas.columns)): myN = 1/(mgwas.MAF * (1-mgwas.MAF)* 2 * mgwas.se**2) else: raise ValueError( 'Both Sample size and SE(beta) are missing: sample size filtering cannot be applied') # replace infinite value by the max finite one if(np.isinf(myN).any()): myN[np.isinf(myN)] = np.max(myN[np.isfinite(myN)]) warnings.warn("Some snp had an infinite sample size") myW_thres = np.percentile(myN.dropna(), 90) ss_thres = np.double(perSS) * myW_thres mgwas["computed_N"] = myN plt.clf() p1 = sns.distplot(mgwas.computed_N[~mgwas.computed_N.isna()]) p1.axvline(x=ss_thres) fo = "{0}/Sample_size_distribution_{1}.png".format(diagnostic_folder, trait) p1.figure.savefig(fo) # Filter SNP with a too small sample _SampleSize print("NSNP before sample size filtering: {}".format(mgwas.shape[0])) mgwas = mgwas.loc[(myN >= ss_thres)] mgwas = mgwas.loc[~mgwas.computed_N.isna()] print("NSNP after sample size filtering: {}".format(mgwas.shape[0])) return(mgwas)