Tutorial 2: scRNA-seq + Classification Analysis
This tutorial demonstrates a complete workflow for integrating single-cell RNA-seq (scRNA-seq) data with bulk RNA-seq data to perform a “Classification” analysis.
The example uses Melanoma (SKCM) data to identify cell populations associated with immunotherapy response.
Full Python Script
This file is available in your repository at Example/SC-Response-SKCM.py. The full code is included here for reference.
1# Example for integrate single-cell RNA-seq data of melanoma and response information
2import warnings
3warnings.filterwarnings("ignore")
4
5import torch
6import pickle
7import os
8
9from tirank.Model import setup_seed, initial_model_para
10from tirank.LoadData import *
11from tirank.SCSTpreprocess import *
12from tirank.GPextractor import GenePairExtractor
13from tirank.Dataloader import generate_val, PackData
14from tirank.TrainPre import tune_hyperparameters, Predict
15from tirank.Visualization import plot_score_distribution, DEG_analysis, DEG_volcano, Pathway_Enrichment
16from tirank.Visualization import plot_score_umap, plot_label_distribution_among_conditions
17
18setup_seed(619)
19
20## 1. Load data
21# 1.1 selecting a path to save the results
22savePath = "./Example/SKCM_SC_Res_experiment"
23savePath_1 = os.path.join(savePath, "1_loaddata")
24if not os.path.exists(savePath_1):
25 os.makedirs(savePath_1, exist_ok=True)
26
27dataPath = "./data/ExampleData/SKCM_SC_Res"
28
29# 1.2 load clinical data
30path_to_bulk_cli = os.path.join(dataPath, "Liu2019_meta.csv")
31bulkClinical = load_bulk_clinical(path_to_bulk_cli)
32view_dataframe(bulkClinical)
33
34# 1.3 load bulk expression profile
35path_to_bulk_exp = os.path.join(dataPath, "Liu2019_exp.csv")
36bulkExp = load_bulk_exp(path_to_bulk_exp)
37bulkExp = normalize_data(bulkExp)
38view_dataframe(bulkExp) ## if user try to view the data
39
40# 1.4 Check name
41check_bulk(savePath, bulkExp, bulkClinical)
42
43# 1.5 load SC data
44path_to_sc_floder = (os.path.join(dataPath,"GSE120575.h5ad"))
45scAnndata = load_sc_data(path_to_sc_floder, savePath)
46st_exp_df = transfer_exp_profile(scAnndata)
47view_dataframe(st_exp_df) ## if user try to view the data
48
49## 2. Preprocessing
50# 2.1 selecting a path to save the results
51savePath_1 = os.path.join(savePath, "1_loaddata")
52savePath_2 = os.path.join(savePath, "2_preprocessing")
53
54if not os.path.exists(savePath_2):
55 os.makedirs(savePath_2, exist_ok=True)
56
57# 2.2 load data
58f = open(os.path.join(savePath_1, "anndata.pkl"), "rb")
59scAnndata = pickle.load(f)
60f.close()
61
62# 2.3 Preprocessing on sc/st data
63infer_mode = "Cell" ## optional parameter
64
65scAnndata = FilteringAnndata(
66 scAnndata,
67 max_count=35000,
68 min_count=3,
69 MT_propor=10,
70 min_cell=1,
71 imgPath=savePath_2,
72) ## optional parameters: max_count, min_count, MT_propor, min_cell
73scAnndata = Normalization(scAnndata)
74scAnndata = Logtransformation(scAnndata)
75scAnndata = Clustering(scAnndata, infer_mode=infer_mode, savePath=savePath)
76compute_similarity(savePath=savePath, ann_data=scAnndata)
77
78with open(os.path.join(savePath_2, "scAnndata.pkl"), "wb") as f:
79 pickle.dump(scAnndata, f)
80f.close()
81
82# 2.4 clinical column selection and bulk data split
83mode = "Classification"
84
85# data split
86generate_val(
87 savePath=savePath, validation_proportion=0.15, mode=mode
88) ## optinal parameter: validation_proportion
89
90# sampling
91perform_sampling_on_RNAseq(savePath=savePath,mode="SMOTE", threshold=0.5)
92
93# 2.5 Genepair Transformation
94GPextractor = GenePairExtractor(
95 savePath=savePath,
96 analysis_mode=mode,
97 top_var_genes=2000,
98 top_gene_pairs=1000,
99 p_value_threshold=0.05,
100 max_cutoff=0.8,
101 min_cutoff=-0.8,
102) ## optinal parameter: top_var_genes, top_gene_pairs, padj_value_threshold, padj_value_threshold
103
104GPextractor.load_data()
105GPextractor.run_extraction()
106GPextractor.save_data()
107
108## 3. Analysis
109# 3.1 TiRank
110savePath_1 = os.path.join(savePath, "1_loaddata")
111savePath_2 = os.path.join(savePath, "2_preprocessing")
112savePath_3 = os.path.join(savePath, "3_Analysis")
113
114if not os.path.exists(savePath_3):
115 os.makedirs(savePath_3, exist_ok=True)
116
117
118# 3.1.1 Dataloader
119mode = "Classification"
120infer_mode = "SC"
121device = "cuda" if torch.cuda.is_available() else "cpu"
122
123PackData(savePath, mode=mode, infer_mode=infer_mode, batch_size=1024)
124
125# 3.1.2 Training
126encoder_type = "MLP" ## Optional parameter
127
128# Model parameter
129initial_model_para(
130 savePath=savePath,
131 nhead=2,
132 nhid1=96,
133 nhid2=8,
134 n_output=32,
135 nlayers=3,
136 n_pred=2,
137 dropout=0.5,
138 mode=mode,
139 encoder_type=encoder_type,
140 infer_mode=infer_mode,
141)
142
143tune_hyperparameters(
144 ## Parameters Path
145 savePath=savePath,
146 device=device,
147 n_trials=5,
148) ## optional parameters: n_trials
149
150# 3.1.3 Inference
151Predict(savePath=savePath, mode=mode, do_reject=True, tolerance=0.05, reject_mode="GMM")
152
153# 3.1.4 Visualization
154plot_score_distribution(savePath) # Display the prob score distribution
155plot_score_umap(savePath,infer_mode)
156plot_label_distribution_among_conditions(savePath,group="leiden_clusters")
157
158# 3.2 DEGs and Pathway enrichment
159fc_threshold = 2
160Pvalue_threshold = 0.05
161do_p_adjust = False
162
163DEG_analysis(
164 savePath,
165 fc_threshold=fc_threshold,
166 Pvalue_threshold=Pvalue_threshold,
167 do_p_adjust=do_p_adjust,
168)
169DEG_volcano(
170 savePath,
171 fc_threshold=fc_threshold,
172 Pvalue_threshold=Pvalue_threshold,
173 do_p_adjust=do_p_adjust,
174)
175
176# database = ["KEGG_2021_Human","MSigDB_Hallmark_2020","GO_Biological_Process_2023"]
177# refer to https://maayanlab.cloud/Enrichr/#libraries
178
179Pathway_Enrichment(savePath, database=["GO_Biological_Process_2023"])
180