Source code for jass_preprocessing.map_gwas

"""
Map GWAS

A set of functions to find GWAS files in subfolder and
to map columns

"""

import os
import sys
import pandas as pd
import numpy as np
import gzip
import re

[docs] def walkfs(startdir, findfile): """ Go through the folder and subfolder to find the specified file Args: startdir (str): path of the folder to explore findfile (str): name of the file to find """ dircount = 0 filecount = 0 for root, dirs, files in os.walk(startdir): if findfile in files: return dircount, filecount + files.index(findfile), os.path.join(root, findfile) dircount += 1 filecount += len(files) # nothing found, return None instead of a full path for the file return dircount, filecount, None
[docs] def convert_missing_values(df): """ Convert all missing value strings to a standart np.nan value Args: GWAS_table (pandas dataframe): GWAS data as a dataframe Return: a pandas dataframe with missing value all equal to np.nan """ def_missing = ['', '#N/A', '#N/A', 'N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', '-nan', '1.#IND', '1.#QNAN', 'N/A', 'NA', 'NULL', 'NaN', 'nan', 'na', '.'] nmissing = len(def_missing) nan_vec = ["NA"] * nmissing return df.replace(def_missing, nan_vec)
[docs] def map_columns_position(gwas_internal_link, column_dict): """ Find column position for each specific Gwas Args: gwas_internal_link (str): filename of the GWAS data (with path) GWAS_labels (pd.DataFrame): corresponding row of the information file Return: pandas Series with column position and column names as index """ gwas_file = gwas_internal_link.split('/')[-1] #Our standart labels: reference_label = column_dict.index.tolist() # labels in the GWAS files target_lab = pd.Index(column_dict.values.tolist()) is_gzipped = re.search(r".gz$", gwas_internal_link) if is_gzipped: f = gzip.open(gwas_internal_link) line = f.readline() line = line.decode('utf-8') else: f = open(gwas_internal_link) line = f.readline() count_line = 0 header = pd.Index(line.split()) def get_position(I,x): try: position_in_header = I.get_loc(x) if isinstance(position_in_header, int): return position_in_header else: raise IndexError("{0} is a not corresponding to an unique column in {1}. Check that column names are unique in the header of {1} Summary Statistics".format(x, gwas_file)) except KeyError: return np.nan label_position = [get_position(header,i) for i in target_lab] mapgw = pd.Series(label_position, index=reference_label) mapgw = mapgw.loc[~mapgw.isna()].astype(int) mapgw.sort_values(inplace=True) f.close() return mapgw
[docs] def read_gwas( gwas_internal_link, column_map, imputation_treshold=None): """ Read gwas raw data, fetch columns thanks to position stored in column_map and rename columns according to column_map.index Args: gwas_internal_link (str): GWAS data as a dataframe column_map (pandas Series): Series containing the position of column in the raw data Return: a pandas dataframe with missing value all equal to np.nan """ print("Reading file:") print(gwas_internal_link) is_gzipped = re.search(r".gz|.bgz$", gwas_internal_link) if is_gzipped: compression = 'gzip' else: compression = None print(column_map.values) print(column_map.index) fullGWAS = pd.read_csv(gwas_internal_link, delim_whitespace=True, usecols = column_map.values, compression=compression, names= column_map.index, header=0, na_values= ['', '#N/A', '#N/A', 'N/A','#NA', '-1.#IND', '-1.#QNAN', '-NaN', '-nan', '1.#IND', '1.#QNAN', 'N/A', 'NA', 'NULL', 'NaN', 'nan', 'na', '.', '-'], dtype={"snpid":str, "a1":str,"a2":str,"freq":np.double, "z":np.double,"se":np.double, "pval":np.double}) print(fullGWAS.head()) #Ensure that allele are written in upper cases: fullGWAS.a1 = fullGWAS.a1.str.upper() fullGWAS.a2 = fullGWAS.a2.str.upper() def sorted_alleles(x): return "".join(sorted(x)) # either rs ID or full position must be available: if not(("CHR" in fullGWAS.columns) and ("POS" in fullGWAS.columns)) and ("snpid" not in fullGWAS.columns): raise ValueError("Summary statistic file {0} doesn't contain rsID or full SNP position".format(gwas_internal_link)) if ("CHR" in fullGWAS.columns) and ("POS" in fullGWAS.columns): fullGWAS["positional_index"] = fullGWAS.CHR.apply(str)+fullGWAS.POS.apply(str)+(fullGWAS.a1+fullGWAS.a2).apply(sorted_alleles) if ("snpid" not in fullGWAS.columns): fullGWAS["snpid"] = fullGWAS["positional_index"] fullGWAS.set_index("snpid", inplace=True) print(fullGWAS.head()) if imputation_treshold: fullGWAS = fullGWAS.loc[fullGWAS.imputation_quality > imputation_treshold] fullGWAS = fullGWAS[~fullGWAS.index.duplicated(keep='first')] #fullGWAS = convert_missing_values(fullGWAS) return fullGWAS