pandas code

MK·2023년 5월 17일
0

import

import pandas as pd
import numpy as np
import math

from sklearn.preprocessing import StandardScaler

from scipy.stats import pearsonr

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

KMeans

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Sweep k=2..9 and record the silhouette score for each cluster count.
# BUG FIX: the original overwrote `score` every iteration and kept nothing,
# so no k could actually be chosen — collect the scores and pick the best.
sil_scores = {}
features = df.drop(columns='X1')  # hoisted: same feature frame every iteration
for k in range(2, 10):
    model = KMeans(n_clusters=k, random_state=1)
    model.fit(X=features)
    score = silhouette_score(X=features, labels=model.labels_)
    sil_scores[k] = score
# k with the highest silhouette score (higher = better-separated clusters).
best_k = max(sil_scores, key=sil_scores.get)
    
# predict3_4 = KMeans(random_state=1234, n_clusters=4, n_init = 50, max_iter = 300).fit(train3).predict(train3).tolist()

# predict(train_set) == labels_

Data Split

from sklearn.model_selection import train_test_split
# Hold out 1% of q3 as a test split; fixed seed for reproducibility.
tr, tet = train_test_split(q3, test_size=0.01, random_state=229)

# Standard 67/33 split of feature matrix X and target y.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

Linear Regression

# Ordinary least squares regression (fit_intercept=True is the default, spelled out).
model = LinearRegression(fit_intercept=True)
# BUG FIX: drop the target 'Y' from the training features too — the original
# fit on df.drop(columns=['apt_code']) left 'Y' in the feature matrix, which
# both leaks the target and mismatches the prediction frame below.
model.fit(X=df.drop(columns=['apt_code', 'Y']), y=df['Y'])

# Predict on the test frame with the same feature columns (id and target removed).
pred = model.predict(X=test_df.drop(columns=['apt_code', 'Y']))

KNN

from sklearn.model_selection import train_test_split  
from sklearn.neighbors import KNeighborsClassifier    

# p=1 -> Manhattan distance; weights='distance' -> closer neighbours vote more.
neigh = KNeighborsClassifier(p=1, weights='distance')
# FIX: pass y as a 1-D Series (tr2['y']), not a one-column DataFrame
# (tr2[['y']]) — sklearn emits DataConversionWarning for column-vector y.
neigh.fit(tr2[['X2']], tr2['y'])

Scaler

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler() # MinMaxScaler() can be swapped in for [0, 1] scaling
# Learn column means/stds on the 'X1'..'X2' label slice of the training frame.
# NOTE(review): fitted on `tr` but applied to `train_set` below — confirm
# these are the same training data, otherwise the transform is inconsistent.
scaler.fit(X = tr.loc[:, 'X1':'X2'])
# Apply the learned scaling in place on the same column slice.
train_set.loc[:, 'X1':'X2'] = scaler.transform(X = train_set.loc[:, 'X1':'X2'])

# One-step alternative: fit and transform the same frame at once.
# scaler.fit_transform(X = train_set.loc[:, 'X1':'X2'])

# MinMaxScaler variant: scale a chosen column subset into a new DataFrame.
# var3 = ['GRE', 'TOEFL', 'SOP', 'LOR', 'CGPA']
# pd.DataFrame(MinMaxScaler().fit_transform(df3_d[var3]), columns=var3)

KFold

from sklearn.model_selection import KFold

# 5 consecutive folds over the rows of d.
kf = KFold(n_splits=5)
# KFold.split yields (train_indices, test_indices) for each fold;
# print the small test fold first, then the large train fold.
for train_idx, test_idx in kf.split(d):
    print(test_idx, train_idx)
[0 1] [2 3 ..]
[2 3] [0 1 4 5 ..] 

Corr/Cov

# Pairwise Pearson correlation matrix over all numeric columns.
df.corr(method='pearson')
# Scalar correlation between two specific columns.
df.Col1.corr(df.Col2)

# Pairwise covariance matrix.
df.cov()

Math

Random Forest

from sklearn.ensemble import RandomForestRegressor

# Target and features: drop the employee id and the target itself.
train_a_y = df6_a['SCORE']
train_a_x = df6_a.drop(columns=['EMP_ID', 'SCORE'])

# Small forest, heavily regularised (at least 10 samples per leaf).
rf_a = RandomForestRegressor(random_state=1234, n_estimators=10, min_samples_leaf=10)
rf_a.fit(train_a_x, train_a_y)

# Impurity-based feature importances, aligned with train_a_x's columns.
a_fi = rf_a.feature_importances_

rank

# method='min': tied values share the smallest rank; ascending=False gives the highest GRE rank 1.
df[['GRE']].rank(method='min', ascending=False)

apply & drop

# Tukey fence: flag TOEFL values outside [q1 - 1.5*iqr, q3 + 1.5*iqr] as
# outliers (1), keep the rest (0), then drop the flagged rows and the helper column.
fence_lo = q1 - 1.5 * iqr
fence_hi = q3 + 1.5 * iqr
df2['outlier'] = (~df2['TOEFL'].between(fence_lo, fence_hi)).astype(int)
df2 = df2.loc[df2['outlier'] == 0].drop(columns='outlier')

reset_index

# Attach the RESEARCH column to df3_d side by side.
# BUG FIX: without axis=1, concat stacks along rows (axis=0) and fills NaNs;
# reset_index(drop=True) on the Series only makes sense for positional
# column alignment, so axis=1 is the intended call.
pd.concat([df3_d, df3['RESEARCH'].reset_index(drop=True)], axis=1)

value_counts & find max

# list X
# DataFrame, Series O
#   -> value_counts() exists on Series/DataFrame, not on plain lists.

# for list: wrap it in a DataFrame to get frequencies, or use count() with
# max(key=...) to get the mode's frequency.
# NOTE: `list` below stands for "some list object" — it shadows the builtin.
pd.DataFrame(list).value_counts()
list.count(max(list, key=list.count))

# Frequency of the most common prediction...
pd.DataFrame(predict3_5).value_counts().max()
# ...and the most common value itself (idxmax returns a tuple; [0] unpacks it).
pd.DataFrame(predict3_5).value_counts().idxmax()[0]

# Mode of a plain list: the element with the highest count().
max(predict3_5, key=predict3_5.count)

# max() with a key function; str.count(sub, start) counts from index `start` on.
max('apple', 'Pear', key=lambda x: x.upper())
"ABCAB".count("A", 1)

max(iterable[, key]) / max(arg1, arg2, *args[, key])
>>> max('apple', 'Pear', key=lambda x: x.upper())
'Pear'
>>> max('apple', 'Pear')
'apple'

accuracy

from sklearn.metrics import accuracy_score
# Classification accuracy as a percentage.
# NOTE(review): this compares y_train with predict2 — confirm predict2 was
# produced from the training set, not the test set.
A22 = accuracy_score(y_train,predict2)*100

get_dummies

# One-hot encode UNIV_RATING, dropping the first level (avoids collinearity);
# reset_index(drop=True) enables positional alignment in a later concat.
dummies3 = pd.get_dummies(df3['UNIV_RATING'], drop_first=True).reset_index(drop=True)

Range-based fold setup (manual cross-validation by ID ranges)

# Step 2-3: manual 6-fold cross-validation over CUST_ID blocks of 200
# (ids assumed to span (0, 1200] — TODO confirm against the data).
from sklearn.linear_model import LogisticRegression  # FIX: never imported elsewhere in this file

fold_size = 200
n_folds = 6
res = []
for fold in range(n_folds):
    lb = fold * fold_size
    ub = lb + fold_size
    # Test fold: ids in (lb, ub]; train: every other id in (0, 1200].
    test = df2_1[(df2_1['CUST_ID'] > lb) & (df2_1['CUST_ID'] <= ub)]
    train = df2_1[((df2_1['CUST_ID'] > 0) & (df2_1['CUST_ID'] <= lb)) |
                  ((df2_1['CUST_ID'] > ub) & (df2_1['CUST_ID'] <= n_folds * fold_size))]

    train_x = train.drop(columns=['CUST_ID', 'SATISFACTION'])
    train_y = train['SATISFACTION']  # 1-D Series avoids sklearn's column-vector warning

    test_x = test.drop(columns=['CUST_ID', 'SATISFACTION'])
    test_y = test['SATISFACTION']

    lr = LogisticRegression(solver='newton-cg', C=100000, random_state=1234)
    lr.fit(train_x, train_y)

    # FIX: predict into a local array instead of assigning into the test
    # slice — the original test_y['prd'] = ... raised SettingWithCopyWarning.
    pred = lr.predict(test_x)
    res.append(accuracy_score(test_y, pred))

# Mean accuracy across the folds.
D = sum(res) / n_folds

0개의 댓글