Source code for jass_preprocessing.compute_score

import pandas as pd
import numpy as np
import jass_preprocessing.dna_utils as dna_u
import warnings
import scipy.stats as ss
import seaborn as sns
import matplotlib.pyplot as plt




[docs]
def compute_z_score(mgwas):
    """
    Compute zscore value and sign1
    add the corresponding column to the mgwas dataframe
    
    the smallest positive value of float
    sys.float_info.min
    2.2250738585072014e-308

    the biggest Z score
    np.sqrt(ss.chi2.isf(sys.float_info.min, 1))
    37.537836095576054
    """
    
    print(mgwas.columns)
    mgwas["computed_z"] = np.sqrt(ss.chi2.isf(mgwas['pval'], 1))


    if 'beta_or_Z' in mgwas.columns:
        sign_vect = np.sign(mgwas.beta_or_Z)
        mgwas["computed_z"] = mgwas["computed_z"].replace(np.inf, 37.537836095576054)
    else:
        if "OR" in mgwas.columns:
            sign_vect = np.sign(mgwas["OR"] - 1.0 + 10**(-8))
            mgwas["computed_z"] = mgwas["computed_z"].replace(np.inf, 37.537836095576054)
        else:
            raise ValueError(
                'The gwas data frame doesn"t contain effect column')

    mgwas["computed_z"] = mgwas["computed_z"] * sign_vect * mgwas["sign_flip"]

    return mgwas



[docs]
def compute_sample_size(mgwas, diagnostic_folder, trait, perSS = 0.7):

    if 'n' in mgwas.columns:
        myN = mgwas.n
    #--- freq, case-cont N exist
    elif(('Ncase' in mgwas.columns) & ('Ncontrol' in mgwas.columns)):
        sumN = mgwas.ncas + mgwas.ncont
        perCase = mgwas.ncas / sumN
        myN = sumN * perCase * (1-perCase)
    #---se, freq exist and QT
    elif(('se' in mgwas.columns) & ('freq' in mgwas.columns)):
        myN = 1/(mgwas.freq * (1-mgwas.freq)* 2 * mgwas.se**2)
    #---se exists but no freq, // use ref panel freq
    elif(('se' in mgwas.columns) & ('freq' not in mgwas.columns)):
        myN = 1/(mgwas.MAF * (1-mgwas.MAF)* 2 * mgwas.se**2)
    else:
        raise ValueError(
            'Both Sample size and SE(beta) are missing: sample size filtering cannot be applied')
    # replace infinite value by the max finite one
    if(np.isinf(myN).any()):
        myN[np.isinf(myN)] = np.max(myN[np.isfinite(myN)])
        warnings.warn("Some snp had an infinite sample size")

    myW_thres = np.percentile(myN.dropna(), 90)
    ss_thres = np.double(perSS) * myW_thres
    mgwas["computed_N"] = myN
    plt.clf()
    p1 = sns.distplot(mgwas.computed_N[~mgwas.computed_N.isna()])
    p1.axvline(x=ss_thres)
    fo = "{0}/Sample_size_distribution_{1}.png".format(diagnostic_folder, trait)
    p1.figure.savefig(fo)
    # Filter SNP with a too small sample _SampleSize
    print("NSNP before sample size filtering: {}".format(mgwas.shape[0]))
    mgwas = mgwas.loc[(myN >= ss_thres)]
    mgwas = mgwas.loc[~mgwas.computed_N.isna()]

    print("NSNP after sample size filtering: {}".format(mgwas.shape[0]))
    return(mgwas)