Ask for help
Closed this issue · 2 comments
Dear Dr, thank you very much for your very great work. I want to get the true value for human of each subclass. Actually, I can get the value for each subclass, but I cannot find out the corresponding relationship between genes and these values.
The code looks like this
import os
import warnings
import numpy as np
import pandas as pd
import matplotlib
from matplotlib import pyplot as plt
from scipy.stats import pearsonr
import seaborn as sns
import h5py
filename = '/data/users/houruiyan/variant_TSS/test_scXpresso/human/M1/class/class.csv'
filename
classes = np.array(pd.read_csv(filename, index_col=0).columns, dtype='str')
classes
filename = '/data/users/houruiyan/variant_TSS/test_scXpresso/human/M1/subclass/subclass.csv'
subclasses = np.array(pd.read_csv(filename, index_col=0).columns, dtype='<U20')
subclasses
ct_mapping = np.vstack([subclasses, subclasses, subclasses])
ct_mapping[0,:] = 'All'
ct_mapping[1,[0,1,11,12,13,18]] = 'Non-neuronal'
ct_mapping[1,[10,14,15,16,17,19]] = 'GABAergic'
ct_mapping[1,[2,3,4,5,6,7,8,9]] = 'Glutamatergic'
ct_mapping
corr_all = pd.DataFrame(np.zeros((len(subclasses)*20*3,3)),
columns = ['Model', 'Cell population', 'Corr'])
corr_all = corr_all.astype({"Model": str, "Cell population": str})
corr_all
y_true_all = pd.DataFrame(np.zeros((18138, 20)), columns=subclasses)
y_pred_all = pd.DataFrame(np.zeros((18138, 20)), columns=subclasses)
y_pred_all2 = pd.DataFrame(np.zeros((18138, 20)), columns=subclasses)
y_pred_all3 = pd.DataFrame(np.zeros((18138, 20)), columns=subclasses)
fold = pd.DataFrame(np.zeros((18138, 20)), columns=subclasses)
y_true_all
y_pred_all
fold
corr_all = pd.DataFrame(np.zeros((len(subclasses)*20*3,3)),
columns = ['Model', 'Cell population', 'Corr'])
corr_all = corr_all.astype({"Model": str, "Cell population": str})
y_true_all = pd.DataFrame(np.zeros((18138, 20)), columns=subclasses)
y_pred_all = pd.DataFrame(np.zeros((18138, 20)), columns=subclasses)
y_pred_all2 = pd.DataFrame(np.zeros((18138, 20)), columns=subclasses)
y_pred_all3 = pd.DataFrame(np.zeros((18138, 20)), columns=subclasses)
fold = pd.DataFrame(np.zeros((18138, 20)), columns=subclasses)
count_all=0
for i, ct in enumerate(subclasses):
y_true_all_ = []
y_pred_all_ = []
y_pred_all2_ = []
y_pred_all3_ = []
fold_ = []
total_rerun = 0
for j in range(20):
y_pred_sc = 0
y_pred_c = 0
y_pred_t = 0
for k in range(5):
file_sc = '/data/users/houruiyan/variant_TSS/test_scXpresso/human/M1/subclass/logmean_multitask_' + str(j) + '/logs_dir' + str(k) + '/results_testdata_best.pkl'
y = pd.read_pickle(file_sc)
y_true = np.asarray(y['y_true'])[:,i]
y_pred = np.asarray(y['y_pred'])[:,i]
y_pred_sc = y_pred_sc + y_pred/5
for k in range(5):
file_c = '/data/users/houruiyan/variant_TSS/test_scXpresso/human/M1/class/logmean_multitask_' + str(j) + '/logs_dir' + str(k) + '/results_testdata_best.pkl'
y = pd.read_pickle(file_c)
pop_c = ct_mapping[1,i]
idx_c = np.where(classes == pop_c)[0]
y_pred = np.asarray(y['y_pred'])[:,idx_c[0]]
y_pred_c += y_pred/5
for k in range(5):
file_t = '/data/users/houruiyan/variant_TSS/test_scXpresso/human/M1/tissue/logmean_All_' + str(j) + '/logs_dir' + str(k) + '/results_testdata_best.pkl'
y = pd.read_pickle(file_t)
y_pred = np.asarray(y['y_pred'])
y_pred_t += y_pred/5
corr_sc, _ = pearsonr(y_true, y_pred_sc)
corr_c, _ = pearsonr(y_true, y_pred_c)
corr_t, _ = pearsonr(y_true, y_pred_t)
corr_all['Model'].values[count_all] = 'Subclass'
corr_all['Cell population'].values[count_all] = ct
corr_all['Corr'].values[count_all] = corr_sc
count_all += 1
corr_all['Model'].values[count_all] = 'Class'
corr_all['Cell population'].values[count_all] = ct
corr_all['Corr'].values[count_all] = corr_c
count_all += 1
corr_all['Model'].values[count_all] = 'Tissue'
corr_all['Cell population'].values[count_all] = ct
corr_all['Corr'].values[count_all] = corr_t
count_all += 1
y_true_all_.extend(y_true)
y_pred_all_.extend(y_pred_sc)
y_pred_all2_.extend(y_pred_c)
y_pred_all3_.extend(y_pred_t)
fold_.extend(np.ones_like(y_true)*j)
y_true_all[ct] = y_true_all_
y_pred_all[ct] = y_pred_all_
y_pred_all2[ct] = y_pred_all2_
y_pred_all3[ct] = y_pred_all3_
fold[ct] = fold_
y_true_all
plt.hist(y_true_all['Astro'],bins=100)
plt.xlabel('Y True value')
plt.ylabel('Frequency')
plt.title('Astro')
plt.show()
y_true_all.to_csv('/data/users/houruiyan/variant_TSS/test_scXpresso/human/M1/celltype_true_y_value.csv')
The y value what I get is looking like this
But it did not include the gene name, do these gene names have the same order with the human_seq_test.h5?
Hope to get your reply. Thank you!
Best
Hi!
The order of the gene names changed compared to human_seq_test.h5 because of the 10-fold cross-validation. If you go to this notebook and scroll to the section 'Link gene names to the predictions' it shows how to match the gene name to the predictions. Make sure that the random_state which is used to create the train/val/test sets is the same as the random_state when linking the gene names (I always used random_state=1).
Hope this helps!
Thank you very much for your quick reply!!