transformApp/script.py at main · gracefeng28/transformApp · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
#import modules
import os
import pandas as pd
from scipy import stats
import numpy as np
import csv
import seaborn as sns
import matplotlib.pyplot as plt
import shutil
import sys

# Command-line handling. Expected form:
#     [program] [sample file] [type of transformation]
# When no arguments are given, the bundled dataset and Box-Cox are used.
test_file_name = "GWAS_Ionomics_Phenotype.txt"
normalization_type = "box_cox"

if len(sys.argv) == 3:
    if not os.path.exists(sys.argv[1]):
        print("Sorry, please enter a valid input file, or run with default parameters")
        # Exit with a non-zero status so shells/pipelines can detect the failure.
        sys.exit(1)
    test_file_name = sys.argv[1]

    requested_type = sys.argv[2].lower()
    if requested_type not in ("box_cox", "sqrt", "log"):
        print("Sorry, please enter a valid transformation type, or run with default parameters")
        sys.exit(1)
    normalization_type = requested_type
elif len(sys.argv) != 1:
    # Any other argument count falls back to the defaults; say so instead of
    # silently ignoring what the user typed.
    print(
        "Expected exactly two arguments ([sample file] [type of transformation]); "
        "ignoring the given arguments and using default parameters",
        file=sys.stderr,
    )

# Plots are written below this directory; exist_ok avoids a separate
# existence check and tolerates the directory already being there.
os.makedirs("plots/", exist_ok=True)
# To delete all the plot files from previous runs, replace the line above with:
#     shutil.rmtree("plots/", ignore_errors=True)
#     os.mkdir("plots/")

# Load the tab-separated phenotype table.
df = pd.read_csv(test_file_name, delimiter="\t")

# Full header row, reused verbatim as the header of the output file.
tsv_header_name = list(df.columns)
# Every column after the first one is treated as a trait to transform.
cols = tsv_header_name[1:]

# Destination of the transformed table, named after the transformation.
output_tsv = "outputs/" + normalization_type + ".tsv"

# Output columns are collected here; the first column of the input is
# carried over unchanged as the first entry.
each_row = [df.loc[:, str(tsv_header_name[0])]]

def is_continous(trait):
    """Return True when the column ``trait`` of ``df`` looks continuous.

    The column is read from the module-level ``df``. Missing values (NaN)
    are ignored. A column is reported as not continuous when it contains a
    boolean, contains a value that is not numeric (for example a string),
    or has at most two distinct non-missing values.

    Args:
        trait: Name of the column to inspect (converted with ``str``).

    Returns:
        bool: True if the column has more than two distinct numeric values.
    """
    distinct_values = set()
    for value in df.loc[:, str(trait)]:
        # isinstance also catches NumPy booleans, which ``type(x) == bool`` misses.
        if isinstance(value, (bool, np.bool_)):
            return False
        try:
            if np.isnan(value):
                continue
        except TypeError:
            # np.isnan rejects non-numeric values such as strings; such a
            # column cannot be transformed, so report it as not continuous
            # instead of aborting the whole run.
            return False
        distinct_values.add(value)
    return len(distinct_values) > 2
#shows plot while code is running
def show_plot(old_data,new_data):
    """Draw the distribution of the data before and after a transformation.

    Both inputs are wrapped in a one-column DataFrame named 'count', plotted
    with seaborn's displot (original data first), and then shown with
    ``plt.show()``.
    """
    after = pd.DataFrame({'count': new_data})
    before = pd.DataFrame({'count': old_data})
    for frame in (before, after):
        sns.displot(frame)
    plt.show()

# save all graphs to folder corresponding with trait
def save_plot(data, data_name,age,transform_type):
    """Save a distribution plot of ``data`` in the folder of its trait.

    The image is written to
    ``plots/<data_name>/<age>_<data_name>_<transform_type>.png``.

    Args:
        data: Values to plot; wrapped in a DataFrame and drawn with displot.
        data_name: Trait name, used for the sub-folder and the file name.
        age: Label placed first in the file name (callers pass "old"/"new").
        transform_type: Label placed last in the file name (e.g. "bc").
    """
    trait_dir = os.path.join("plots", data_name)
    # makedirs with exist_ok creates a missing "plots" parent as well and
    # does not fail when the folder already exists.
    os.makedirs(trait_dir, exist_ok=True)
    sns.displot(pd.DataFrame(data))
    plt.savefig(os.path.join(trait_dir, f"{age}_{data_name}_{transform_type}.png"))
    # Close the current figure once it has been written to disk.
    plt.close()

def run_boxcox(trait):
    """Box-Cox transform one trait column and queue it for output.

    Reads the column ``trait`` from the module-level ``df``, fits a Box-Cox
    transformation on its non-missing values, rounds the result to two
    decimals, saves "old"/"new" distribution plots through ``save_plot`` and
    appends the transformed column to the module-level ``each_row``. The
    fitted lambda is printed.

    Missing values stay missing at their original positions. Dropping them
    would shorten the column, and the shortened column is then lined up by
    position against the sample column when the output table is built,
    shifting values onto the wrong samples.

    Args:
        trait: Name of the column to transform (converted with ``str``).

    Raises:
        ValueError: If any non-missing value is zero or negative.
    """
    values = df.loc[:, str(trait)].to_numpy(dtype=float)
    present = ~np.isnan(values)
    observed = values[present]
    # Box-Cox is only defined for strictly positive data. Raise explicitly:
    # an assert would be stripped when Python runs with -O.
    if (observed <= 0).any():
        raise ValueError(
            f"Box-Cox requires strictly positive values; trait {trait} "
            "contains zero or negative values"
        )
    #perform box cox transformation
    fitted_data, fitted_lambda = stats.boxcox(observed)
    fitted_data = fitted_data.round(2)
    #show_plot(observed,fitted_data)
    #save plots for old and new
    save_plot(observed, trait,"old","bc")
    save_plot(fitted_data, trait,"new", "bc")
    # Put the transformed values back on the rows they came from.
    aligned = np.full(values.shape, np.nan)
    aligned[present] = fitted_data
    each_row.append(aligned)
    print(f"Lambda value used for Transformation: {fitted_lambda}")


def run_sqrt(trait):
    """Square-root transform one trait column and queue it for output.

    Reads the column ``trait`` from the module-level ``df``, applies
    ``np.sqrt`` to its non-missing values, saves "old"/"new" distribution
    plots through ``save_plot`` and appends the transformed column to the
    module-level ``each_row``.

    Missing values stay missing at their original positions. Dropping them
    would shorten the column, and the shortened column is then lined up by
    position against the sample column when the output table is built,
    shifting values onto the wrong samples.

    Args:
        trait: Name of the column to transform (converted with ``str``).
    """
    values = df.loc[:, str(trait)].to_numpy(dtype=float)
    present = ~np.isnan(values)
    observed = values[present]
    #perform square root transformation
    # NOTE: np.sqrt yields NaN (with a RuntimeWarning) for negative inputs.
    sqrt_output = np.sqrt(observed)
    #show_plot(observed,sqrt_output)
    save_plot(observed, trait,"old", "sqrt")
    save_plot(sqrt_output, trait,"new", "sqrt")
    # Put the transformed values back on the rows they came from.
    aligned = np.full(values.shape, np.nan)
    aligned[present] = sqrt_output
    each_row.append(aligned)

def run_log(trait):
    """Natural-log transform one trait column and queue it for output.

    Reads the column ``trait`` from the module-level ``df``, applies
    ``np.log`` to its non-missing values, saves "old"/"new" distribution
    plots through ``save_plot`` and appends the transformed column to the
    module-level ``each_row``.

    Missing values stay missing at their original positions. Dropping them
    would shorten the column, and the shortened column is then lined up by
    position against the sample column when the output table is built,
    shifting values onto the wrong samples.

    Args:
        trait: Name of the column to transform (converted with ``str``).
    """
    values = df.loc[:, str(trait)].to_numpy(dtype=float)
    present = ~np.isnan(values)
    observed = values[present]
    #perform natural log transformation
    # NOTE: np.log yields -inf for zero and NaN for negative inputs
    # (with a RuntimeWarning).
    log_output = np.log(observed)
    #show_plot(observed,log_output)
    save_plot(observed, trait,"old", "log")
    save_plot(log_output, trait,"new", "log")
    # Put the transformed values back on the rows they came from.
    aligned = np.full(values.shape, np.nan)
    aligned[present] = log_output
    each_row.append(aligned)

print(f"Performing {normalization_type } transformation on {test_file_name}")

# Pick the handler for the requested transformation once; any name other
# than "box_cox" or "sqrt" falls through to the log transformation.
transform_handlers = {"box_cox": run_boxcox, "sqrt": run_sqrt}
apply_transform = transform_handlers.get(normalization_type, run_log)

# Process every trait column in input order.
for col in cols:
    if not is_continous(col):
        # Non-continuous traits are reported and copied to the output as-is.
        print("Could not process the following trait (not continuous): " + col)
        each_row.append(df.loc[:, str(col)])
        continue
    print("Currently transforming: " + col)
    apply_transform(col)

#rotate rows to become columns
each_row_df = pd.DataFrame(each_row).transpose()

# open() does not create missing parent directories, so make sure the
# output folder exists before writing.
output_dir = os.path.dirname(output_tsv)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# newline="" hands line-ending control to the csv module, as its
# documentation requires; without it Windows output gets blank rows.
# The with-statement closes the file, so no explicit close() is needed.
with open(output_tsv, 'w', newline="") as tsvfile:
    #csv writer to write in tsv file
    tsv_writer = csv.writer(tsvfile, delimiter='\t')
    #write header in tsv file
    tsv_writer.writerow(tsv_header_name)
    #write rows
    tsv_writer.writerows(each_row_df.values.tolist())