pycaret baseline - regression

Loxia · April 14, 2023
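The snippet assumes the training/test DataFrames and column-type lists already exist. A minimal sketch of those inputs (the file names, ordinal mapping, and categorical list are placeholder assumptions, not from the original post):

import pandas as pd

train_data = pd.read_csv('train.csv')  # training set containing the target column
test = pd.read_csv('test.csv')         # unseen data used for prediction later
ordinal = {}      # e.g. {'grade': ['low', 'mid', 'high']} - ordered categories per column
categorical = []  # e.g. ['region', 'product_type'] - nominal columns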
from pycaret.regression import *
# initialize
s = setup(data=train_data, target='IXX', train_size=0.8, test_data=None,
          ordinal_features=ordinal, categorical_features=categorical,
          ignore_features=None, keep_features=None,
          n_jobs=-1, use_gpu=False, session_id=69,

          data_split_shuffle=True, data_split_stratify=False,
          fold_strategy='kfold', fold=4, fold_shuffle=True,

          preprocess=True,
          # imputations
          imputation_type=None,  # 'simple', 'iterative', or None (None skips imputation entirely)
          numeric_imputation='mean',  # 'drop', 'mean', 'median', 'mode', 'knn', int or float
          categorical_imputation='mode',  # 'drop', 'mode', str
          iterative_imputation_iters=5,  # only when iterative

          # categorical encoding
          max_encoding_ohe=25, encoding_method=None, # If None, category_encoders.leave_one_out.LeaveOneOutEncoder is used.
          rare_to_value=None, rare_value='rare',

          # feature transformation
          normalize=True, normalize_method='zscore',  # 'zscore', 'minmax'          
          remove_outliers=False, outliers_method='iforest', outliers_threshold=0.05, # 'iforest', 'ee', 'lof'
          remove_multicollinearity=False, multicollinearity_threshold=0.9,
          polynomial_features=False, polynomial_degree=2,

          transformation=False, transformation_method='yeo-johnson',  # 'quantile', 'yeo-johnson'          
          pca=False, pca_method='linear', pca_components=None, # 'linear', 'kernel', 'incremental'         
          feature_selection=False, feature_selection_method='classic', # 'classic', 'univariate', 'sequential'
          n_features_to_select=0.9,

          bin_numeric_features=None,          
          low_variance_threshold=None, # None: skip; 0: keep only non-zero-variance features; float: drop features with variance below the threshold

          numeric_iterative_imputer='lightgbm',
          categorical_iterative_imputer='lightgbm',
          feature_selection_estimator='lightgbm'
          )
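The add_metric calls below reference three user-defined score functions. A hedged sketch, using one common definition for each (the original post does not show its exact formulas):

import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, max_error

def calc_normalized_rmse(y_true, y_pred):
    # RMSE scaled by the target range (one common NRMSE convention; assumption)
    rmse = mean_squared_error(y_true, y_pred) ** 0.5
    return rmse / (np.max(y_true) - np.min(y_true))

def calc_mae_percentage(y_true, y_pred):
    # MAE as a percentage of the mean target value (assumption)
    return mean_absolute_error(y_true, y_pred) / np.mean(y_true) * 100

def calc_max_error(y_true, y_pred):
    # worst-case absolute residual
    return max_error(y_true, y_pred)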
# apply custom metrics
metric_remove = ['mae', 'mse', 'r2', 'rmsle', 'mape', 'rmse']
for metric in metric_remove:
    s.remove_metric(metric)
s.add_metric(id='nrmse', name='NRMSE', score_func=calc_normalized_rmse, greater_is_better=False)
s.add_metric(id='maep', name='MAE_P', score_func=calc_mae_percentage, greater_is_better=False)
s.add_metric(id='maxerr', name='MAX_E', score_func=calc_max_error, greater_is_better=False)
# fit and evaluate: cross-validate every estimator, ranked by the custom metric
best = compare_models(exclude=['lr', 'lar'], sort='nrmse')  # exclude models by their IDs
# or compare the full model library
best = compare_models(sort='nrmse')
# predict on external data
y_pred = s.predict_model(best, data=test)
y_pred['prediction_label']  # predictions land in the 'prediction_label' column
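Called without data=, predict_model scores the hold-out split that setup() carved off (20% here), and pull() retrieves the metric table that was just displayed:

holdout_pred = s.predict_model(best)  # scores the hold-out from train_size=0.8
holdout_metrics = s.pull()            # the metric table as a DataFrame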
# automated EDA (AutoViz; requires the autoviz package)
eda(display_format='svg')
# automated analysis
evaluate_model(best)   # interactive evaluation plots
interpret_model(best)  # SHAP-based interpretation (requires shap)
dashboard(best)        # interactive dashboard (requires explainerdashboard)
deep_check(best)       # data/model test suite (requires deepchecks)
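Individual diagnostics are also available through plot_model; a few regression-relevant ones:

plot_model(best, plot='residuals')  # residual plot
plot_model(best, plot='error')      # prediction error (y_true vs. y_pred)
plot_model(best, plot='feature')    # feature importance, model permitting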
# automated hyperparameter tuning
tuned = tune_model(best, optimize='nrmse', choose_better=True, n_iter=50,
                   search_library='optuna')  # 'optuna', 'tune-sklearn', 'scikit-optimize'
                                             # each backend needs its package installed (e.g. pip install optuna)
# tuning + ensembling + blending + stacking pipeline example
top3 = compare_models(sort='nrmse', n_select=3)  # keep the 3 best models
tuned_top3 = [tune_model(i, optimize='nrmse', search_library='optuna', choose_better=True) for i in top3]  # tune each of the top 3
boosting_top3 = [ensemble_model(i, optimize='nrmse', method='Boosting', n_estimators=100, choose_better=True) for i in tuned_top3]  # boosted versions
bagging_top3 = [ensemble_model(i, optimize='nrmse', method='Bagging', n_estimators=100, choose_better=True) for i in tuned_top3]  # bagged versions
blender = blend_models(bagging_top3, optimize='nrmse', choose_better=True, weights=[0.5, 0.5, 0.5])  # voting ensemble (equal weights here)
stacker = stack_models(tuned_top3)  # stacking ensemble; meta-model defaults to LinearRegression
lb = get_leaderboard()  # leaderboard of every model trained in this session
lb.iloc[0]['Model']  # first row's fitted pipeline (sort lb by your metric first if needed)
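A typical closing step is to refit the winning candidate on the full dataset with finalize_model before predicting; a minimal sketch using the blender:

final = finalize_model(blender)           # refit on train + hold-out
y_pred = predict_model(final, data=test)  # predictions in 'prediction_label'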