🧬 유전자 발현 데이터 시각화 및 분석 (GSE96058)

장채민·2025년 7월 31일

📦 사용한 라이브러리

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

📂 1. 시리즈 매트릭스 파일 불러오기

file_path='GSE96058-GPL11154_series_matrix.txt'

# Defined up front so later code can check `df_raw is None` instead of hitting
# a NameError when loading fails.
df_raw=None

try:
    # Find the table header: the first line that starts with '"ID_REF'.
    # None (not 0) marks "not found", because 0 is a valid line index.
    data_start_line=None
    # Stream the file line by line; only the header position is needed here.
    # errors='replace' keeps a stray non-UTF-8 byte in the metadata lines from
    # aborting the scan.
    with open(file_path,'r',encoding='utf-8',errors='replace') as f:
        for i,line in enumerate(f):
            if line.startswith('"ID_REF'):
                data_start_line=i
                print(data_start_line)
                break

    if data_start_line is None:
        # No header row: report it and do not attempt to parse the file.
        print("데이터 시작 행을 찾을 수 없습니다.")
    else:
        # Skip everything above the header; the first column becomes the index.
        df_raw=pd.read_csv(file_path, sep='\t', skiprows=data_start_line, index_col=0)

except FileNotFoundError:
    print(f"오류: 파일 경로 '{file_path}'를 확인해주세요.")
except Exception as e:
    # Best-effort script: report any other read/parse failure and carry on.
    print(f"데이터 처리 중 오류: {e}")

📈 2. 유전자 발현 데이터 간단 시각화 (히트맵)

# Load the expression matrix from the gzip-compressed CSV.
df=pd.read_csv('GSE96058_gene_expression_3273_samples_and_136_replicates_transformed.csv.gz')

# Use the first column as the row index.
# NOTE(review): presumably this column holds the gene names - confirm against the file.
df.set_index(df.columns[0], inplace=True)

# Coerce every column to numeric (unparseable cells become NaN), then replace
# NaN with 0; the heatmap fails on non-numeric data.
df=df.apply(pd.to_numeric, errors='coerce')
df.fillna(0,inplace=True)

# Keep only a small corner of the matrix: the first 50 rows and first 50
# columns in file order.
df_subset=df.iloc[:50,:50]

# Draw the heatmap.
plt.figure(figsize=(18,10))
sns.heatmap(df_subset, cmap='viridis', cbar=True)
plt.title("Gene Expression Heatmap (Top 50 genes x 50 samples)")
# Anchor rotated labels at their right end so each one stays under its column.
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

🧭 3. PCA (주성분 분석)

샘플들을 2차원 공간에서 시각화

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# Transpose so that rows are the original columns and columns are the original
# rows.
# NOTE(review): presumably rows become samples and columns genes - confirm
# against how `df` was indexed.
df_T=df.T

# Standardize each column to zero mean and unit variance before PCA.
scaler=StandardScaler()
df_scaled=scaler.fit_transform(df_T)

# Reduce to the first two principal components for a 2-D view.
pca=PCA(n_components=2)
pca_result=pca.fit_transform(df_scaled)

# Scatter plot of PC1 against PC2, one point per row of df_T.
plt.figure(figsize=(10,6))
# No `cmap` here: without per-point values passed as `c`, matplotlib ignores
# the colormap and emits a warning.
plt.scatter(pca_result[:,0],pca_result[:,1], alpha=0.6)
plt.title('PCA of Samples (based on gene expression)')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.grid(True)
plt.show()

🧬 4. Clustermap

유전자 100개만 무작위로 뽑아서 클러스터링된 히트맵 그리기

# The full matrix has too many rows to cluster at once, so draw a random
# subset of 100 rows. The fixed seed makes the figure reproducible from run
# to run.
df_subset2=df.sample(n=100,axis=0,random_state=42)

cluster_grid=sns.clustermap(df_subset2, cmap="viridis", figsize=(14,10))
# clustermap builds its own multi-axes figure, so plt.title() would attach the
# title to whichever axes happens to be current. Put it on the column
# dendrogram axes, which sits at the top of the grid.
cluster_grid.ax_col_dendrogram.set_title("Sample-Gene Clustermap")
plt.show()

🎯 5. 유방암 주요 유전자 발현 확인 (박스플롯)

# Genes of interest for the box plot.
genes=['BRCA1', 'TP53', 'ESR1', 'ERBB2', 'PGR', 'GATA3']

# Keep only the rows of df whose index label appears in `genes`.
present_genes=df.index.intersection(genes)
df_target_genes=df.loc[present_genes]

# One box per selected row: transpose so each selected row becomes a column.
box_ax=df_target_genes.transpose().plot.box(figsize=(10,6))
box_ax.set_title("Expression distribution of breast cancer-related genes")
box_ax.set_ylabel("Expression level")
plt.xticks(rotation=45)
box_ax.grid(True)
plt.show()

🧪 6. BRCA1 발현 높은 샘플 vs 낮은 샘플 비교

# Pull out the BRCA1 row as a one-row frame and make sure it is numeric.
li=["BRCA1"]
df_BRCA1 =df.loc[df.index.intersection(li)]
df_BRCA1=df_BRCA1.apply(pd.to_numeric, errors='coerce').fillna(0)

# Fail with a clear message instead of a bare KeyError from .loc below.
if 'BRCA1' not in df_BRCA1.index:
    raise KeyError("BRCA1 is not present in the expression matrix index")

# Look the row up once and reuse it for the median and both masks.
brca1_expr=df_BRCA1.loc['BRCA1']
med_value=brca1_expr.median()

print(med_value)

# Split the columns at the median; values equal to the median go to "high".
low_samples=df_BRCA1.loc[:, brca1_expr < med_value]
high_samples=df_BRCA1.loc[:, brca1_expr >= med_value]

# Draw up to 50 columns from each group. Capping n at the group size avoids
# the ValueError that sample() raises when asked for more columns than exist.
n_low=min(50, low_samples.shape[1])
n_high=min(50, high_samples.shape[1])
low_samples_50=low_samples.sample(n=n_low, axis=1, random_state=42)
high_samples_50=high_samples.sample(n=n_high, axis=1, random_state=42)

# Low-expression columns first, then high-expression columns.
combined=pd.concat([low_samples_50, high_samples_50], axis=1)

print(combined.size)

plt.figure(figsize=(18,10))
sns.heatmap(combined, cmap='viridis', cbar=True)
plt.title("BRCA1 Expression: High vs Low Samples")
plt.xticks(rotation=45)
plt.yticks(rotation=0)
# Keep the rotated tick labels inside the figure.
plt.tight_layout()
plt.show()

0개의 댓글