!pip install pandas seaborn scikit-learn matplotlib numpy requests xgboost gseapy

Defaulting to user installation because normal site-packages is not writeable
Requirement already satisfied: pandas in /home/user/.local/lib/python3.10/site-packages (2.3.3)
Requirement already satisfied: seaborn in /home/user/.local/lib/python3.10/site-packages (0.13.2)
Requirement already satisfied: scikit-learn in /home/user/.local/lib/python3.10/site-packages (1.7.2)
Requirement already satisfied: matplotlib in /home/user/.local/lib/python3.10/site-packages (3.10.7)
Requirement already satisfied: numpy in /home/user/.local/lib/python3.10/site-packages (2.2.6)
Requirement already satisfied: requests in /usr/lib/python3/dist-packages (2.25.1)
Requirement already satisfied: xgboost in /home/user/.local/lib/python3.10/site-packages (3.1.1)
Requirement already satisfied: gseapy in /home/user/.local/lib/python3.10/site-packages (1.1.10)
Requirement already satisfied: python-dateutil>=2.8.2 in /home/user/.local/lib/python3.10/site-packages (from pandas) (2.9.0.post0)
Requirement already satisfied: tzdata>=2022.7 in /home/user/.local/lib/python3.10/site-packages (from pandas) (2025.2)
Requirement already satisfied: pytz>=2020.1 in /usr/lib/python3/dist-packages (from pandas) (2022.1)
Requirement already satisfied: threadpoolctl>=3.1.0 in /home/user/.local/lib/python3.10/site-packages (from scikit-learn) (3.6.0)
Requirement already satisfied: scipy>=1.8.0 in /home/user/.local/lib/python3.10/site-packages (from scikit-learn) (1.15.3)
Requirement already satisfied: joblib>=1.2.0 in /home/user/.local/lib/python3.10/site-packages (from scikit-learn) (1.5.2)
Requirement already satisfied: pyparsing>=3 in /home/user/.local/lib/python3.10/site-packages (from matplotlib) (3.2.5)
Requirement already satisfied: packaging>=20.0 in /home/user/.local/lib/python3.10/site-packages (from matplotlib) (25.0)
Requirement already satisfied: pillow>=8 in /usr/lib/python3/dist-packages (from matplotlib) (9.0.1)
Requirement already satisfied: cycler>=0.10 in /home/user/.local/lib/python3.10/site-packages (from matplotlib) (0.12.1)
Requirement already satisfied: contourpy>=1.0.1 in /home/user/.local/lib/python3.10/site-packages (from matplotlib) (1.3.2)
Requirement already satisfied: kiwisolver>=1.3.1 in /home/user/.local/lib/python3.10/site-packages (from matplotlib) (1.4.9)
Requirement already satisfied: fonttools>=4.22.0 in /home/user/.local/lib/python3.10/site-packages (from matplotlib) (4.60.1)
Requirement already satisfied: nvidia-nccl-cu12 in /home/user/.local/lib/python3.10/site-packages (from xgboost) (2.28.7)
Requirement already satisfied: six>=1.5 in /home/user/.local/lib/python3.10/site-packages (from python-dateutil>=2.8.2->pandas) (1.17.0)


import sys
import json
import time
import requests
import warnings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import XGBClassifier
from sklearn.manifold import TSNE
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.decomposition import PCA


!wget https://gdc-hub.s3.us-east-1.amazonaws.com/download/GDC-PANCAN.htseq_fpkm-uq.tsv -O GDC-PANCAN.htseq_fpkm-uq.tsv
!wget https://gdc-hub.s3.us-east-1.amazonaws.com/download/GDC-PANCAN.TCGA_phenotype.tsv -O GDC-PANCAN.TCGA_phenotype.tsv
!wget https://gdc-hub.s3.us-east-1.amazonaws.com/download/GDC-PANCAN.gistic.tsv -O GDC-PANCAN.gistic.tsv


# Load RNA-seq and Phenotype data
expression_df = pd.read_csv('GDC-PANCAN.htseq_fpkm-uq.tsv', sep='\t')
phenotype_df = pd.read_csv('GDC-PANCAN.TCGA_phenotype.tsv', sep='\t')

# Prepare labels (cancer type)
phenotype_labels = phenotype_df[['sample', 'project.project_id']]
phenotype_labels = phenotype_labels.set_index('sample')

# Prepare features (gene expression)
# Set gene ID as index, transpose so rows are samples
features_df = expression_df.set_index('xena_sample').T
features_df.index.name = 'sample'

# Merge features and labels
merged_df = features_df.join(phenotype_labels, how='inner')
merged_df = merged_df.dropna(subset=['project.project_id'])

# Separate features (X) and labels (y)
y = merged_df['project.project_id']
X = merged_df.drop('project.project_id', axis=1)

# Feature selection: Top 1000 most variable genes
gene_variances = X.var()
top_1000_genes = gene_variances.sort_values(ascending=False).index[:1000]
X_top_1000 = X[top_1000_genes]

# Create stratified train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X_top_1000, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


# Train a baseline Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=32)
model.fit(X_train_scaled, y_train)

# Evaluate the baseline model
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)

print(f"\nBaseline Model Accuracy: {accuracy * 100:.2f}%")
print("\nBaseline Classification Report (Test Set):")
print(classification_report(y_test, y_pred, zero_division=0))

Baseline Model Accuracy: 91.95%

Baseline Classification Report (Test Set):
              precision    recall  f1-score   support

    TCGA-ACC       1.00      0.94      0.97        16
   TCGA-BLCA       0.95      0.92      0.93        86
   TCGA-BRCA       0.96      0.97      0.97       244
   TCGA-CESC       0.93      0.68      0.79        62
   TCGA-CHOL       0.75      0.33      0.46         9
   TCGA-COAD       0.74      1.00      0.85       102
   TCGA-DLBC       1.00      0.90      0.95        10
   TCGA-ESCA       0.73      0.54      0.62        35
    TCGA-GBM       1.00      0.97      0.99        35
   TCGA-HNSC       0.87      0.95      0.91       109
   TCGA-KICH       0.95      1.00      0.97        18
   TCGA-KIRC       0.93      0.93      0.93       122
   TCGA-KIRP       0.95      0.88      0.91        64
   TCGA-LAML       1.00      1.00      1.00        30
    TCGA-LGG       0.98      0.99      0.99       106
   TCGA-LIHC       0.92      0.98      0.95        85
   TCGA-LUAD       0.86      0.97      0.92       117
   TCGA-LUSC       0.96      0.79      0.87       110
   TCGA-MESO       1.00      0.82      0.90        17
     TCGA-OV       1.00      1.00      1.00        76
   TCGA-PAAD       0.89      0.86      0.87        36
   TCGA-PCPG       1.00      0.97      0.99        37
   TCGA-PRAD       1.00      1.00      1.00       110
   TCGA-READ       1.00      0.03      0.06        35
   TCGA-SARC       0.76      0.98      0.86        53
   TCGA-SKCM       0.96      0.95      0.95        94
   TCGA-STAD       0.82      0.91      0.87        81
   TCGA-TGCT       1.00      0.97      0.98        31
   TCGA-THCA       1.00      1.00      1.00       114
   TCGA-THYM       1.00      0.96      0.98        24
   TCGA-UCEC       0.85      0.99      0.92       117
    TCGA-UCS       1.00      0.18      0.31        11
    TCGA-UVM       1.00      0.94      0.97        16

    accuracy                           0.92      2212
   macro avg       0.93      0.86      0.87      2212
weighted avg       0.93      0.92      0.91      2212


# Generate and plot the confusion matrix
class_labels = model.classes_
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

# Normalize by true label (rows) to show recall
cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

plt.figure(figsize=(15, 12))
sns.heatmap(
    cm_normalized,
    xticklabels=class_labels,
    yticklabels=class_labels,
    cmap='Blues',
    annot=False,
    fmt='.2f'
)
plt.title('Normalized Confusion Matrix (Recall)')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.tight_layout()


# Plot PCA (Principal Component Analysis)
pca = PCA(n_components=2)
X_test_pca = pca.fit_transform(X_test_scaled)
pca_df = pd.DataFrame(data=X_test_pca, columns=['PC1', 'PC2'])
pca_df['Cancer Type'] = y_test.values

plt.figure(figsize=(12, 8))
sns.scatterplot(
    data=pca_df,
    x='PC1', y='PC2',
    hue='Cancer Type',
    alpha=0.7, s=50,
    legend='full'
)
plt.title('PCA of TCGA Test Set')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., title='Cancer Type')
plt.tight_layout()
plt.show()


# Plot t-SNE
tsne = TSNE(n_components=2, random_state=42, perplexity=30.0)
X_test_tsne = tsne.fit_transform(X_test_scaled)
tsne_df = pd.DataFrame(data=X_test_tsne, columns=['t-SNE 1', 't-SNE 2'])
tsne_df['Cancer Type'] = y_test.values

plt.figure(figsize=(12, 8))
sns.scatterplot(
    data=tsne_df,
    x='t-SNE 1', y='t-SNE 2',
    hue='Cancer Type',
    alpha=0.7, s=50,
    legend='full'
)
plt.title('t-SNE of TCGA Test Set')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., title='Cancer Type')
plt.tight_layout()
plt.show()


# Get feature importances from the baseline model
importances = model.feature_importances_
feature_imp_df = pd.DataFrame({
    'Gene': X_top_1000.columns,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

# Get top 10 gene IDs (cleaned of version numbers)
top_10_ids = [gid.split('.')[0] for gid in feature_imp_df.head(10)['Gene']]

# Query Ensembl REST API to map IDs to symbols
print("Querying Ensembl API for gene symbols...")
server = "https://rest.ensembl.org"
ext = "/lookup/id"
headers = {"Content-Type": "application/json", "Accept": "application/json"}
data = json.dumps({"ids": top_10_ids})

r = requests.post(server + ext, headers=headers, data=data)
r.raise_for_status()
decoded = r.json()

gene_map = {}
for ensembl_id, info in decoded.items():
    symbol = info.get('display_name', ensembl_id) if info else ensembl_id
    gene_map[ensembl_id] = symbol

feature_imp_df['Gene Symbol'] = feature_imp_df['Gene'].apply(
    lambda x: gene_map.get(x.split('.')[0], x)
)

# Plot the Top 10 genes
plt.figure(figsize=(10, 6))
sns.barplot(
    data=feature_imp_df.head(10),
    x='Importance',
    y='Gene Symbol'
)
plt.title('Top 10 Most Important Genes for Classification')
plt.xlabel('Gini Importance')
plt.ylabel('Gene Symbol')
plt.tight_layout()
plt.savefig('feature_importance.png', dpi=300)
print("Feature importance plot saved.")


import scipy.cluster.hierarchy as sch
from sklearn.preprocessing import StandardScaler

print("--- Running Hierarchical Clustering ---")

# 'X_top_1000' is your RNA-seq feature matrix (from Cell 4) [cite: 51]
# 'y' is your series of cancer type labels (from Cell 4) [cite: 48]

# --- 1. Aggregate: Get the mean expression for each cancer type ---
# Combine features and labels
mean_df = X_top_1000.copy()
mean_df['cancer_type'] = y

# Group by cancer type and calculate the mean for all 1000 genes
mean_expression_df = mean_df.groupby('cancer_type').mean()

print(f"Aggregated data shape: {mean_expression_df.shape}") # Should be (33, 1000)

# --- 2. Scale the aggregated data ---
# We scale "across genes" (axis=0) to normalize each gene's profile
scaler_agg = StandardScaler()
mean_expression_scaled = scaler_agg.fit_transform(mean_expression_df.T).T # Transpose, scale, transpose back

# Convert back to a DataFrame
mean_expression_scaled_df = pd.DataFrame(
    mean_expression_scaled,
    index=mean_expression_df.index,
    columns=mean_expression_df.columns
)

# --- 3. Cluster and Plot the Dendrogram ---
# 'method='ward'' is a robust clustering method
linkage_matrix = sch.linkage(mean_expression_scaled_df, method='ward')

# Get the full names (from Cell 13) to use as labels
try:
    name_map_df = phenotype_df[['project.project_id', 'project.name']].drop_duplicates()
    name_map = pd.Series(name_map_df['project.name'].values, 
                         index=name_map_df['project.project_id']).to_dict()
    
    # Map the abbreviations in our index to the full names
    plot_labels = mean_expression_scaled_df.index.map(name_map).fillna(mean_expression_scaled_df.index)
except Exception:
    print("Could not create full name map, using abbreviations.")
    plot_labels = mean_expression_scaled_df.index

# Plot
plt.figure(figsize=(15, 10))
plt.title('Hierarchical Clustering of Cancer Types (by Mean Gene Expression)', fontsize=16)
plt.ylabel('Cluster Distance (Ward)', fontsize=12)
sch.dendrogram(
    linkage_matrix,
    labels=plot_labels.to_list(),
    leaf_rotation=90,
    leaf_font_size=10
)
plt.tight_layout()

--- Running Hierarchical Clustering ---
Aggregated data shape: (33, 1000)
Could not create full name map, using abbreviations.


# Get the top 100 gene symbols for pathway analysis
top_100_genes = feature_imp_df.head(100)['Gene Symbol'].tolist()
top_100_genes = [g for g in top_100_genes if not g.startswith('ENSG')] # Remove unmapped

print(f"Running pathway analysis on {len(top_100_genes)} mapped genes...")

enr = gp.enrichr(gene_list=top_100_genes,
                    gene_sets=['KEGG_2021_Human', 'GO_Biological_Process_2021'],
                    organism='Human',
                    outdir='enrichr_results',
                    cutoff=0.1)
gp.plot.barplot(enr.results,
                column='Adjusted P-value',
                title='Top Enriched Pathways in 100 Most Important Genes')

Running pathway analysis on 10 mapped genes...

<Axes: title={'center': 'Top Enriched Pathways in 100 Most Important Genes'}, xlabel='$- \\log_{10}$ (Adjusted P-value)'>


# Find full Ensembl IDs
gene_map_df = feature_imp_df.set_index('Gene Symbol')
ids_to_plot = [
    gene_map_df.loc['NKX2-1', 'Gene'], 
    gene_map_df.loc['SCGB2A2', 'Gene'], 
    gene_map_df.loc['HOXB13', 'Gene']
]

# Create plotting df
plot_df = X[ids_to_plot].copy()
plot_df['Cancer Type'] = y
plot_df = plot_df.melt(id_vars='Cancer Type', var_name='Gene', value_name='Expression (FPKM-UQ)')

# Map Ensembl IDs back to Symbols for plotting
symbol_map = {v: k for k, v in gene_map_df['Gene'].to_dict().items() if k in ['NKX2-1', 'SCGB2A2', 'HOXB13']}
plot_df['Gene Symbol'] = plot_df['Gene'].map(symbol_map)

# Create boxplots
g = sns.catplot(
    data=plot_df, 
    x='Cancer Type', 
    y='Expression (FPKM-UQ)', 
    col='Gene Symbol',
    kind='box',
    col_wrap=1,
    height=5, 
    aspect=3,
    sharex=False,
    sharey=False
)

g.set_xticklabels(rotation=90)
g.fig.suptitle('Expression of Top Lineage-Specific Genes Across Cancer Types', y=1.03)
plt.tight_layout()


# Encode labels for models that require it
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)
y_test_encoded = le.transform(y_test)

best_models = {}
tuned_model_results = {}

# Logistic Regression
start_time = time.time()
lr = LogisticRegression(max_iter=2000, random_state=42, n_jobs=1)
param_grid_lr = {'C': [0.01, 0.05, 0.1, 0.5]}
grid_lr = GridSearchCV(lr, param_grid_lr, cv=3, n_jobs=32, scoring='accuracy')
grid_lr.fit(X_train_scaled, y_train_encoded)
best_models['Logistic Regression'] = grid_lr.best_estimator_
print(f"Finished in {time.time() - start_time:.2f}s. Best Score: {grid_lr.best_score_:.4f}")
print(f"Best Parameters: {grid_lr.best_params_}")


# K-Nearest Neighbors
start_time = time.time()
knn = KNeighborsClassifier()
param_grid_knn = {'n_neighbors': [3, 5, 7]}
grid_knn = GridSearchCV(knn, param_grid_knn, cv=3, n_jobs=32, scoring='accuracy')
grid_knn.fit(X_train_scaled, y_train_encoded)
best_models['K-Nearest Neighbors'] = grid_knn.best_estimator_
print(f"Finished in {time.time() - start_time:.2f}s. Best Score: {grid_knn.best_score_:.4f}")
print(f"Best Parameters: {grid_knn.best_params_}")

# Random Forest
start_time = time.time()
rf = RandomForestClassifier(random_state=42, n_jobs=1)
param_grid_rf = {'n_estimators': [100, 200], 'max_depth': [10, 20, None]}
grid_rf = GridSearchCV(rf, param_grid_rf, cv=3, n_jobs=32, scoring='accuracy')
grid_rf.fit(X_train_scaled, y_train_encoded)
best_models['Random Forest'] = grid_rf.best_estimator_
print(f"Finished in {time.time() - start_time:.2f}s. Best Score: {grid_rf.best_score_:.4f}")
print(f"Best Parameters: {grid_rf.best_params_}")

start_time = time.time()
warnings.filterwarnings("ignore", category=UserWarning, module='xgboost')
param_grid_xgb = {'learning_rate': [0.05, 0.1], 'max_depth': [5, 10], 'n_estimators': [100, 200]}
xgb = XGBClassifier(device="cuda", random_state=42, use_label_encoder=False, eval_metric='mlogloss')
grid_xgb = GridSearchCV(xgb, param_grid_xgb, cv=3, n_jobs=1, scoring='accuracy')
grid_xgb.fit(X_train_scaled, y_train_encoded)
best_models['XGBoost'] = grid_xgb.best_estimator_
print(f"Finished in {time.time() - start_time:.2f}s. Best Score: {grid_xgb.best_score_:.4f}")
print(f"Best Parameters: {grid_xgb.best_params_}")

# LinearSVC
start_time = time.time()
param_grid_lsvc = {'C': [0.01, 0.1, 1.0]}
lsvc = LinearSVC(max_iter=4000, random_state=42, dual='auto')
grid_lsvc = GridSearchCV(lsvc, param_grid_lsvc, cv=3, n_jobs=32, scoring='accuracy')
grid_lsvc.fit(X_train_scaled, y_train_encoded)
best_models['LinearSVC'] = grid_lsvc.best_estimator_
print(f"Finished in {time.time() - start_time:.2f}s. Best Score: {grid_lsvc.best_score_:.4f}")
print(f"Best Parameters: {grid_lsvc.best_params_}")

Finished in 29.27s. Best Score: 0.9480
Best Parameters: {'C': 0.1}
Finished in 1.92s. Best Score: 0.8996
Best Parameters: {'n_neighbors': 7}
Finished in 66.94s. Best Score: 0.9218
Best Parameters: {'max_depth': None, 'n_estimators': 200}
Finished in 189.01s. Best Score: 0.9386
Best Parameters: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200}
Finished in 1183.42s. Best Score: 0.9412
Best Parameters: {'C': 0.01}


for model_name, model in best_models.items():
    y_pred = model.predict(X_test_scaled)
    accuracy = accuracy_score(y_test_encoded, y_pred)
    tuned_model_results[model_name] = accuracy
    print(f"Tuned {model_name} Test Accuracy: {accuracy * 100:.2f}%")

# Visualize the final results
results_df = pd.DataFrame(list(tuned_model_results.items()), columns=['Model', 'Accuracy'])
results_df = results_df.sort_values(by='Accuracy', ascending=False)

plt.figure(figsize=(10, 6))
sns.barplot(data=results_df, x='Accuracy', y='Model')
plt.title('Tuned Model Accuracy Benchmark (on Test Set)')
plt.xlabel('Accuracy Score')
plt.ylabel('')
plt.xlim(0.8, 1.0)
plt.tight_layout()

Tuned Logistic Regression Test Accuracy: 94.39%
Tuned K-Nearest Neighbors Test Accuracy: 90.55%
Tuned Random Forest Test Accuracy: 91.91%
Tuned XGBoost Test Accuracy: 93.58%
Tuned LinearSVC Test Accuracy: 93.31%


cnv_df = pd.read_csv('GDC-PANCAN.gistic.tsv', sep='\t', index_col=0)
cnv_df_transposed = cnv_df.T
cnv_df_transposed.index.name = 'sample'

# Feature selection: Top 1000 most variable CNV genes
print("Performing feature selection on CNV data...")
cnv_variances = cnv_df_transposed.var()
top_1000_cnv_genes = cnv_variances.sort_values(ascending=False).index[:1000]
X_top_1000_cnv = cnv_df_transposed[top_1000_cnv_genes]

# Add a suffix to CNV gene names to avoid collisions
X_top_1000_cnv = X_top_1000_cnv.add_suffix('_cnv')

Performing feature selection on CNV data...


print(f"Original RNA-seq shape: {X_top_1000.shape}")
print(f"New CNV shape: {X_top_1000_cnv.shape}")

# Combine the RNA-seq features and the new CNV features
X_multi_omic = pd.concat([X_top_1000, X_top_1000_cnv], axis=1, join='inner')
print(f"Combined multi-omic shape: {X_multi_omic.shape}")

# Re-align the labels (y) to match the samples in the new multi-omic dataframe
y_multi_omic = y.reindex(X_multi_omic.index)
y_multi_omic_encoded = le.transform(y_multi_omic)

# Re-split and re-scale the new dataset ---
X_mo_train, X_mo_test, y_mo_train, y_mo_test = train_test_split(
    X_multi_omic, y_multi_omic_encoded,
    test_size=0.2, 
    random_state=42, 
    stratify=y_multi_omic_encoded
)

# Create and fit new scaler
scaler_mo = StandardScaler()
X_mo_train_scaled = scaler_mo.fit_transform(X_mo_train)
X_mo_test_scaled = scaler_mo.transform(X_mo_test)

print(f"Multi-omic training data shape: {X_mo_train_scaled.shape}")
print(f"Multi-omic test data shape: {X_mo_test_scaled.shape}")

Original RNA-seq shape: (11057, 1000)
New CNV shape: (11368, 1000)
Combined multi-omic shape: (10205, 2000)
Multi-omic training data shape: (8164, 2000)
Multi-omic test data shape: (2041, 2000)


# Re-run benchmark on multi-omic data
multi_omic_results = {}
final_multi_omic_models = {}

# Logistic Regression
start_time = time.time()
lr_mo = LogisticRegression(C=0.1, max_iter=2000, random_state=42, n_jobs=1)
lr_mo.fit(X_mo_train_scaled, y_mo_train)
final_multi_omic_models['Logistic Regression'] = lr_mo
print(f"Finished in {time.time() - start_time:.2f}s")

# K-Nearest Neighbors 
start_time = time.time()
knn_mo = KNeighborsClassifier(n_neighbors=7, n_jobs=32)
knn_mo.fit(X_mo_train_scaled, y_mo_train)
final_multi_omic_models['K-Nearest Neighbors'] = knn_mo
print(f"Finished in {time.time() - start_time:.2f}s")

# Random Forest
start_time = time.time()
rf_mo = RandomForestClassifier(max_depth=None, n_estimators=200, random_state=42, n_jobs=32)
rf_mo.fit(X_mo_train_scaled, y_mo_train)
final_multi_omic_models['Random Forest'] = rf_mo
print(f"Finished in {time.time() - start_time:.2f}s")

# XGBoost
start_time = time.time()
warnings.filterwarnings("ignore", category=UserWarning, module='xgboost')
xgb_mo = XGBClassifier(device="cuda", learning_rate=0.1, max_depth=5, n_estimators=200,
                       random_state=42, use_label_encoder=False, eval_metric='mlogloss')
xgb_mo.fit(X_mo_train_scaled, y_mo_train)
final_multi_omic_models['XGBoost'] = xgb_mo
print(f"Finished in {time.time() - start_time:.2f}s")

# LinearSVC
start_time = time.time()
lsvc_mo = LinearSVC(C=0.01, max_iter=4000, random_state=42, dual='auto')
lsvc_mo.fit(X_mo_train_scaled, y_mo_train)
final_multi_omic_models['LinearSVC'] = lsvc_mo
print(f"Finished in {time.time() - start_time:.2f}s")

# Evaluation
for model_name, model in final_multi_omic_models.items():
    y_mo_pred = model.predict(X_mo_test_scaled)
    accuracy_mo = accuracy_score(y_mo_test, y_mo_pred)
    multi_omic_results[model_name] = accuracy_mo
    print(f"Tuned {model_name} Test Accuracy: {accuracy_mo * 100:.2f}%")

Finished in 37.98s
Finished in 0.14s
Finished in 1.62s
Finished in 12.86s
Finished in 149.47s
Tuned Logistic Regression Test Accuracy: 95.30%
Tuned K-Nearest Neighbors Test Accuracy: 78.59%
Tuned Random Forest Test Accuracy: 93.68%
Tuned XGBoost Test Accuracy: 95.25%
Tuned LinearSVC Test Accuracy: 94.76%


# Rescale y-values for plotting
baseline = 0.85
final_comparison_df['Height'] = final_comparison_df['Accuracy'] - baseline

models = final_comparison_df['Model'].unique()
data_types = final_comparison_df['Data Type'].unique()
x = np.arange(len(models))
width = 0.35

plt.figure(figsize=(12, 7))
ax = plt.subplot(111)

heights1 = final_comparison_df[final_comparison_df['Data Type'] == 'RNA-seq Only']['Height']
heights2 = final_comparison_df[final_comparison_df['Data Type'] == 'RNA-seq + CNV']['Height']
rects1 = ax.bar(x - width/2, heights1, width, label='RNA-seq Only', bottom=baseline)
rects2 = ax.bar(x + width/2, heights2, width, label='RNA-seq + CNV', bottom=baseline)

ax.axhline(y=baseline, color='black', linestyle='--')
ax.set_ylabel('Test Set Accuracy')
ax.set_title('Model Performance')
ax.set_xticks(x)
ax.set_xticklabels(models)
ax.legend(title='Data Type')
ax.set_ylim(0.78,0.97) 

plt.tight_layout()

Pan-Cancer Classification Using Multi-Omic Synergy¶

Project Motivation¶

Imports and Data Collection¶

Part 1: Baseline Model with RNA-seq Data¶

Train Baseline RandomForest¶

Visualizing the Feature Space (PCA vs. t-SNE)¶

Baseline Feature Importance¶

Pathway Analysis (GSEA)¶

Analysis of Lineage-Specific Gene Expression¶

Part 2: Benchmarking RNA-seq Models¶

Plot RNA-seq Benchmark Results¶

Part 3: Testing the Multi-Omic Synergy Hypothesis¶

Conclusion and Findings¶