Tutorial 2: scRNA-seq + Classification Analysis

This tutorial demonstrates a complete workflow for integrating single-cell RNA-seq (scRNA-seq) data with bulk RNA-seq data to perform a “Classification” analysis.

The example uses Melanoma (SKCM) data to identify cell populations associated with immunotherapy response.

Full Python Script

This file is available in your repository at Example/SC-Response-SKCM.py. The full code is included here for reference.

  1# Example for integrate single-cell RNA-seq data of melanoma and response information
  2import warnings
  3warnings.filterwarnings("ignore")
  4
  5import torch
  6import pickle
  7import os
  8
  9from tirank.Model import setup_seed, initial_model_para
 10from tirank.LoadData import *
 11from tirank.SCSTpreprocess import *
 12from tirank.GPextractor import GenePairExtractor
 13from tirank.Dataloader import generate_val, PackData
 14from tirank.TrainPre import tune_hyperparameters, Predict
 15from tirank.Visualization import plot_score_distribution, DEG_analysis, DEG_volcano, Pathway_Enrichment
 16from tirank.Visualization import plot_score_umap, plot_label_distribution_among_conditions
 17
 18setup_seed(619)
 19
 20## 1. Load data
 21# 1.1 selecting a path to save the results
 22savePath = "./Example/SKCM_SC_Res_experiment"
 23savePath_1 = os.path.join(savePath, "1_loaddata")
 24if not os.path.exists(savePath_1):
 25    os.makedirs(savePath_1, exist_ok=True)
 26
 27dataPath = "./data/ExampleData/SKCM_SC_Res"
 28
 29# 1.2 load clinical data
 30path_to_bulk_cli = os.path.join(dataPath, "Liu2019_meta.csv")
 31bulkClinical = load_bulk_clinical(path_to_bulk_cli)
 32view_dataframe(bulkClinical)
 33
 34# 1.3 load bulk expression profile
 35path_to_bulk_exp = os.path.join(dataPath, "Liu2019_exp.csv")
 36bulkExp = load_bulk_exp(path_to_bulk_exp)
 37bulkExp = normalize_data(bulkExp)
 38view_dataframe(bulkExp)  ## if user try to view the data
 39
 40# 1.4 Check name
 41check_bulk(savePath, bulkExp, bulkClinical)
 42
 43# 1.5 load SC data
 44path_to_sc_floder = (os.path.join(dataPath,"GSE120575.h5ad"))
 45scAnndata = load_sc_data(path_to_sc_floder, savePath)
 46st_exp_df = transfer_exp_profile(scAnndata)
 47view_dataframe(st_exp_df)  ## if user try to view the data
 48
 49## 2. Preprocessing
 50# 2.1 selecting a path to save the results
 51savePath_1 = os.path.join(savePath, "1_loaddata")
 52savePath_2 = os.path.join(savePath, "2_preprocessing")
 53
 54if not os.path.exists(savePath_2):
 55    os.makedirs(savePath_2, exist_ok=True)
 56
 57# 2.2 load data
 58f = open(os.path.join(savePath_1, "anndata.pkl"), "rb")
 59scAnndata = pickle.load(f)
 60f.close()
 61
 62# 2.3 Preprocessing on sc/st data
 63infer_mode = "Cell"  ## optional parameter
 64
 65scAnndata = FilteringAnndata(
 66    scAnndata,
 67    max_count=35000,
 68    min_count=3,
 69    MT_propor=10,
 70    min_cell=1,
 71    imgPath=savePath_2,
 72)  ## optional parameters: max_count, min_count, MT_propor, min_cell
 73scAnndata = Normalization(scAnndata)
 74scAnndata = Logtransformation(scAnndata)
 75scAnndata = Clustering(scAnndata, infer_mode=infer_mode, savePath=savePath)
 76compute_similarity(savePath=savePath, ann_data=scAnndata)
 77
 78with open(os.path.join(savePath_2, "scAnndata.pkl"), "wb") as f:
 79    pickle.dump(scAnndata, f)
 80f.close()
 81
 82# 2.4 clinical column selection and bulk data split
 83mode = "Classification"
 84
 85# data split
 86generate_val(
 87    savePath=savePath, validation_proportion=0.15, mode=mode
 88)  ## optinal parameter: validation_proportion
 89
 90# sampling
 91perform_sampling_on_RNAseq(savePath=savePath,mode="SMOTE", threshold=0.5)
 92
 93# 2.5 Genepair Transformation
 94GPextractor = GenePairExtractor(
 95    savePath=savePath,
 96    analysis_mode=mode,
 97    top_var_genes=2000,
 98    top_gene_pairs=1000,
 99    p_value_threshold=0.05,
100    max_cutoff=0.8,
101    min_cutoff=-0.8,
102)  ## optinal parameter: top_var_genes, top_gene_pairs, padj_value_threshold, padj_value_threshold
103
104GPextractor.load_data()
105GPextractor.run_extraction()
106GPextractor.save_data()
107
108## 3. Analysis
109# 3.1 TiRank
110savePath_1 = os.path.join(savePath, "1_loaddata")
111savePath_2 = os.path.join(savePath, "2_preprocessing")
112savePath_3 = os.path.join(savePath, "3_Analysis")
113
114if not os.path.exists(savePath_3):
115    os.makedirs(savePath_3, exist_ok=True)
116
117
118# 3.1.1 Dataloader
119mode = "Classification"
120infer_mode = "SC"
121device = "cuda" if torch.cuda.is_available() else "cpu"
122
123PackData(savePath, mode=mode, infer_mode=infer_mode, batch_size=1024)
124
125# 3.1.2 Training
126encoder_type = "MLP"  ## Optional parameter
127
128# Model parameter
129initial_model_para(
130    savePath=savePath,
131    nhead=2,
132    nhid1=96,
133    nhid2=8,
134    n_output=32,
135    nlayers=3,
136    n_pred=2,
137    dropout=0.5,
138    mode=mode,
139    encoder_type=encoder_type,
140    infer_mode=infer_mode,
141)
142
143tune_hyperparameters(
144    ## Parameters Path
145    savePath=savePath,
146    device=device,
147    n_trials=5,
148)  ## optional parameters: n_trials
149
150# 3.1.3 Inference
151Predict(savePath=savePath, mode=mode, do_reject=True, tolerance=0.05, reject_mode="GMM")
152
153# 3.1.4 Visualization
154plot_score_distribution(savePath)  # Display the prob score distribution
155plot_score_umap(savePath,infer_mode)
156plot_label_distribution_among_conditions(savePath,group="leiden_clusters")
157
158# 3.2 DEGs and Pathway enrichment
159fc_threshold = 2
160Pvalue_threshold = 0.05
161do_p_adjust = False
162
163DEG_analysis(
164    savePath,
165    fc_threshold=fc_threshold,
166    Pvalue_threshold=Pvalue_threshold,
167    do_p_adjust=do_p_adjust,
168)
169DEG_volcano(
170    savePath,
171    fc_threshold=fc_threshold,
172    Pvalue_threshold=Pvalue_threshold,
173    do_p_adjust=do_p_adjust,
174)
175
176# database = ["KEGG_2021_Human","MSigDB_Hallmark_2020","GO_Biological_Process_2023"]
177# refer to https://maayanlab.cloud/Enrichr/#libraries
178
179Pathway_Enrichment(savePath, database=["GO_Biological_Process_2023"])
180