import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
file_path = 'GSE96058-GPL11154_series_matrix.txt'


def find_header_line(path, marker='"ID_REF'):
    """Return the 0-based index of the first line starting with *marker*.

    The file is streamed line by line, so it is never loaded into memory
    as a whole.

    Args:
        path: Path of the text file to scan.
        marker: Prefix that identifies the table header line.

    Returns:
        The line index of the header, or ``None`` when no line starts with
        *marker*. ``None`` (rather than ``0``) is the "not found" value so
        that a header on the very first line is still reported correctly.

    Raises:
        FileNotFoundError: If *path* does not exist.
    """
    # Only the line prefix matters here, so undecodable bytes are replaced
    # instead of aborting the scan.
    with open(path, 'r', encoding='utf-8', errors='replace') as handle:
        for line_no, line in enumerate(handle):
            if line.startswith(marker):
                return line_no
    return None


# Find where the data table starts, then load it.
# NOTE(review): the message strings below look mis-encoded (mojibake) in this
# copy of the file; they are kept byte-for-byte - restore them from the
# original source if it is available.
df_raw = None  # stays None whenever the table could not be loaded
data_start_line = 0
try:
    header_line = find_header_line(file_path)
    if header_line is None:
        # No header line: parsing the metadata as a table would only yield
        # garbage or a parser error, so report it and skip the load.
        print("๋ฐ์ดํฐ ์์ ํ์ ์ฐพ์ ์ ์์ต๋๋ค.")
    else:
        data_start_line = header_line
        print(data_start_line)
        # NOTE(review): the file name suggests a GEO series matrix; if it
        # ends with a '!series_matrix_table_end' line, that line is parsed
        # as a data row - confirm and drop it if needed.
        df_raw = pd.read_csv(file_path, sep='\t', skiprows=data_start_line, index_col=0)
except FileNotFoundError:
    print(f"์ค๋ฅ: ํ์ผ ๊ฒฝ๋ก '{file_path}'๋ฅผ ํ์ธํด์ฃผ์ธ์.")
except Exception as e:
    print(f"๋ฐ์ดํฐ ์ฒ๋ฆฌ ์ค ์ค๋ฅ: {e}")
# Load the transformed gene-expression matrix from the compressed CSV.
expression_csv = 'GSE96058_gene_expression_3273_samples_and_136_replicates_transformed.csv.gz'
df = pd.read_csv(expression_csv)
# The first column holds the gene names; promote it to the row index.
df = df.set_index(df.columns[0])
# Coerce every column to numeric (unparsable cells become NaN) and replace
# the missing values with 0 - without this step the heatmap below fails.
df = df.apply(pd.to_numeric, errors='coerce').fillna(0)
# Keep the plot readable: use only the first 50 genes (rows) and the first
# 50 samples (columns).
df_subset = df.iloc[0:50, 0:50]
# Draw the heatmap on a wide figure; seaborn returns the axes it drew on.
plt.figure(figsize=(18, 10))
heat_ax = sns.heatmap(df_subset, cbar=True, cmap='viridis')
heat_ax.set_title("Gene Expression Heatmap (ToP 50 genes x 50 samples)")
# Tilt the sample labels and keep the gene labels horizontal.
plt.yticks(rotation=0)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
# Visualize the samples in a two-dimensional space
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
# Transpose so that rows are samples and columns are genes.
df_T = df.T
# Standardize every gene (column) to zero mean and unit standard deviation.
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df_T)
# PCA (scikit-learn dimensionality reduction): keep only the two axes
# (principal components) that best explain the data, i.e. project to 2-D.
# random_state is fixed because scikit-learn may choose a randomized SVD
# solver for large inputs, which would otherwise vary from run to run.
pca = PCA(n_components=2, random_state=42)
pca_result = pca.fit_transform(df_scaled)
# Visualization: scatter plot of the samples on the first two principal
# components (x axis = PC1, y axis = PC2).
plt.figure(figsize=(10, 6))
# alpha makes overlapping points visible. No `cmap` is passed: without a
# `c` argument matplotlib ignores the colormap and only emits a warning.
plt.scatter(pca_result[:, 0], pca_result[:, 1], alpha=0.6)
plt.title('PCA of Samples (based on gene expression)')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.grid(True)
plt.show()
# Draw a clustered heatmap from only 100 randomly drawn rows
# There are too many genes to cluster at once, so draw a random subset of
# about 100 rows. Rows of df are genes (its index holds the gene names), so
# axis=0 samples genes, not samples.
# The count is capped at the number of available rows so small frames do not
# raise, and the seed is fixed so the figure is reproducible.
n_sampled_genes = min(100, len(df))
df_subset2 = df.sample(n=n_sampled_genes, axis=0, random_state=42)
cluster_grid = sns.clustermap(df_subset2, cmap="viridis", figsize=(14, 10))
# A clustermap is a grid of several axes (dendrograms, heatmap, colorbar);
# plt.title would label whichever axes happens to be current, so the title
# is set on the figure that owns the heatmap instead.
cluster_grid.ax_heatmap.figure.suptitle("Sample-Gene Clustermap")
plt.show()
# Analysis of key breast-cancer genes.
genes = ['BRCA1', 'TP53', 'ESR1', 'ERBB2', 'PGR', 'GATA3']
# Of df's row index (gene names), keep only the entries listed in `genes`.
present_genes = df.index.intersection(genes)
df_target_genes = df.loc[present_genes]
# Transpose so each gene becomes a column, giving one box per gene.
df_target_genes.T.plot.box(figsize=(10, 6))
plt.grid(True)
plt.xticks(rotation=45)
plt.ylabel("Expression level")
plt.title("Expression distribution of breast cancer-related genes")
plt.show()
# Split the samples into BRCA1-low and BRCA1-high groups at the median
# expression and show up to 50 randomly chosen samples of each in a heatmap.
target_gene = "BRCA1"
samples_per_group = 50
li = [target_gene]
# The intersection yields an empty frame (instead of raising) when the gene
# is absent from the index.
df_BRCA1 = df.loc[df.index.intersection(li)]
df_BRCA1 = df_BRCA1.apply(pd.to_numeric, errors='coerce').fillna(0)
if target_gene not in df_BRCA1.index:
    # Without the gene row there is nothing to split; report it and skip the
    # plot rather than failing with a KeyError.
    print(f"{target_gene} is not present in the expression matrix; skipping the high/low heatmap.")
else:
    brca1_expression = df_BRCA1.loc[target_gene]
    med_value = brca1_expression.median()
    print(med_value)
    # Samples strictly below the median are "low"; the rest are "high".
    low_samples = df_BRCA1.loc[:, brca1_expression < med_value]
    high_samples = df_BRCA1.loc[:, brca1_expression >= med_value]
    # Cap the draw at the group size: many values tied at the median can
    # leave the "low" group with fewer columns than requested.
    low_samples_50 = low_samples.sample(
        n=min(samples_per_group, low_samples.shape[1]), axis=1, random_state=42
    )
    high_samples_50 = high_samples.sample(
        n=min(samples_per_group, high_samples.shape[1]), axis=1, random_state=42
    )
    # Low-expression samples first, then high-expression samples.
    combined = pd.concat([low_samples_50, high_samples_50], axis=1)
    print(combined.size)
    plt.figure(figsize=(18, 10))
    sns.heatmap(combined, cmap='viridis', cbar=True)
    plt.title("BRCA1 Expression: High vs Low Samples")
    plt.xticks(rotation=45)
    plt.yticks(rotation=0)
    plt.show()
