Facies Prediction from Well Logs - Neural Network Model Hyper-Parameters Optimisation
In this project, we will implement standard and comprehensive steps to select the best model and hyper-parameters to predict rock facies from a dataset. First, we will prepare the data for modeling, fit the models and cross-validate, predict facies labels, and evaluate prediction accuracy with several model evaluation metrics. Finally, we will examine model performance on blind-well data. These are the models that we will use:
1 - Logistic Regression Classifier
2 - K Neighbors Classifier
3 - Decision Tree Classifier
4 - Random Forest Classifier
5 - Support Vector Classifier
6 - Gaussian Naive Bayes Classifier
7 - Gradient Boosting Classifier
8 - Extra Trees Classifier
The dataset for this study comes from the Hugoton and Panoma Fields in North America. It consists of wireline log data (measurements of the physical properties of rocks) from nine wells. We will use these log data to train supervised classifiers in order to predict discrete facies groups. For more detail, you may take a look here. The seven features are:
GR: gamma ray log; this wireline logging tool measures natural gamma emission
ILD_log10: resistivity measurement (base-10 logarithm)
PE: photoelectric effect log
DeltaPHI: neutron-density porosity difference (Phi is a porosity index in petrophysics)
PHIND: average of neutron and density porosity logs
NM_M: nonmarine-marine indicator
RELPOS: relative position
The nine discrete facies (classes of rocks) are:
(SS) Nonmarine sandstone
(CSiS) Nonmarine coarse siltstone
(FSiS) Nonmarine fine siltstone
(SiSH) Marine siltstone and shale
(MS) Mudstone (limestone)
(WS) Wackestone (limestone)
(D) Dolomite
(PS) Packstone-grainstone (limestone)
(BS) Phylloid-algal bafflestone (limestone)
The project content:
1- Data Exploratory Analysis¶
1-1 Data visualization
1-1-1 log-plot
1-1-2 Bar plot
1-1-3 Cross-plot
1-2 Feature Engineering
1-2-1 NaN imputation
1-2-2 Feature extraction
1-2-3 Oversampling
1-3 Feature Importance
1-3-1 Feature linear correlation
1-3-2 Decision tree
1-3-3 Permutation feature importance
2- Build Model & Validate¶
2-1 Baseline Model
2-2 Hyper-parameters
2-2-1 Grid search
3- Model Evaluation-1¶
3-1 Model Metrics
3-2 Confusion matrix
4- Model Evaluation-2¶
4-1 Learning curves
4-2 ROC plot
4-3 Blind well prediction and evaluation
1- Data Exploratory Analysis¶
After reading the data into Python using Pandas, we can visualize it to understand the data better. Before plotting, we need to define a color map and assign a color code to each facies (this step belongs in the feature engineering part, but we need it here to color the facies classes in the plots).
Note 1: the code embedded in this manuscript is presented to illustrate the work procedure. If you want to practice it yourself, I highly recommend using the Jupyter notebook file.
Note 2: shuffling the data can cause differences between your runs and what appears here.
import pandas as pd
import matplotlib.colors as colors
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable
from sklearn.preprocessing import LabelEncoder
from collections import Counter
pd.set_option('display.max_rows', 30)
import numpy as np
import seaborn as sns
from sklearn.metrics import accuracy_score
df = pd.read_csv('facies_vectors.csv')
df.tail()
| | Facies | Formation | Well Name | Depth | GR | ILD_log10 | DeltaPHI | PHIND | PE | NM_M | RELPOS |
|---|---|---|---|---|---|---|---|---|---|---|---|
4144 | 5 | C LM | CHURCHMAN BIBLE | 3120.5 | 46.719 | 0.947 | 1.828 | 7.254 | 3.617 | 2 | 0.685 |
4145 | 5 | C LM | CHURCHMAN BIBLE | 3121.0 | 44.563 | 0.953 | 2.241 | 8.013 | 3.344 | 2 | 0.677 |
4146 | 5 | C LM | CHURCHMAN BIBLE | 3121.5 | 49.719 | 0.964 | 2.925 | 8.013 | 3.190 | 2 | 0.669 |
4147 | 5 | C LM | CHURCHMAN BIBLE | 3122.0 | 51.469 | 0.965 | 3.083 | 7.708 | 3.152 | 2 | 0.661 |
4148 | 5 | C LM | CHURCHMAN BIBLE | 3122.5 | 50.031 | 0.970 | 2.609 | 6.668 | 3.295 | 2 | 0.653 |
# explicitly set the data types Python may be concerned about
df['Facies'] = df['Facies'].astype('int')
df['Depth'] = df['Depth'].astype('float')
df['Well Name'] = df['Well Name'].astype('category')
# colors
facies_colors = ['xkcd:goldenrod', 'xkcd:orange','xkcd:sienna','xkcd:violet',
'xkcd:olive','xkcd:turquoise', "xkcd:yellowgreen", 'xkcd:indigo', 'xkcd:blue']
facies_labels = ['SS', 'CSiS', 'FSiS', 'SiSh',
'MS', 'WS', 'D','PS', 'BS']
#facies_color_map is a dictionary that maps facies labels to their respective colors
facies_color_map = {}
for ind, label in enumerate(facies_labels):
facies_color_map[label] = facies_colors[ind]
def label_facies(row, labels):
return labels[ row['Facies'] -1]
#establish facies label str
df.loc[:,'FaciesLabels'] = df.apply(lambda row: label_facies(row, facies_labels), axis=1)
data=df
1-1-1 Log-plot¶
This function creates a composite log plot (the five logs plus a facies column) for a single well.
def make_facies_log_plot(logs, facies_colors):
#make sure logs are sorted by depth
logs = logs.sort_values(by='Depth')
cmap_facies = colors.ListedColormap(
facies_colors[0:len(facies_colors)], 'indexed')
ztop=logs.Depth.min(); zbot=logs.Depth.max()
cluster=np.repeat(np.expand_dims(logs['Facies'].values,1), 100, 1)
f, ax = plt.subplots(nrows=1, ncols=6, figsize=(12, 6))
ax[0].plot(logs.GR, logs.Depth, '-g', alpha=0.8, lw = 0.9)
ax[1].plot(logs.ILD_log10, logs.Depth, '-b', alpha=0.8, lw = 0.9)
ax[2].plot(logs.DeltaPHI, logs.Depth, '-k', alpha=0.8, lw = 0.9)
ax[3].plot(logs.PHIND, logs.Depth, '-r', alpha=0.8, lw = 0.9)
ax[4].plot(logs.PE, logs.Depth, '-c', alpha=0.8, lw = 0.9)
im=ax[5].imshow(cluster, interpolation='none', aspect='auto',
cmap=cmap_facies,vmin=1,vmax=9)
divider = make_axes_locatable(ax[5])
cax = divider.append_axes("right", size="20%", pad=0.05)
cbar=plt.colorbar(im, cax=cax)
cbar.set_label((5*' ').join([' SS ', 'CSiS', 'FSiS',
'SiSh', ' MS ', ' WS ', ' D ',
' PS ', ' BS ']))
cbar.set_ticks(range(0,1)); cbar.set_ticklabels('')
for i in range(len(ax)-1):
ax[i].set_ylim(ztop,zbot)
ax[i].invert_yaxis()
ax[i].grid()
ax[i].locator_params(axis='x', nbins=3)
ax[0].set_xlabel("GR")
ax[0].set_xlim(logs.GR.min(),logs.GR.max())
ax[1].set_xlabel("ILD_log10")
ax[1].set_xlim(logs.ILD_log10.min(),logs.ILD_log10.max())
ax[2].set_xlabel("DeltaPHI")
ax[2].set_xlim(logs.DeltaPHI.min(),logs.DeltaPHI.max())
ax[3].set_xlabel("PHIND")
ax[3].set_xlim(logs.PHIND.min(),logs.PHIND.max())
ax[4].set_xlabel("PE")
ax[4].set_xlim(logs.PE.min(),logs.PE.max())
ax[5].set_xlabel('Facies')
ax[1].set_yticklabels([]); ax[2].set_yticklabels([]); ax[3].set_yticklabels([])
ax[4].set_yticklabels([]); ax[5].set_yticklabels([])
ax[5].set_xticklabels([])
f.suptitle('Well: %s'%logs.iloc[0]['Well Name'], fontsize=14,y=0.94)
data['Well Name'].unique()
['SHRIMPLIN', 'ALEXANDER D', 'SHANKLE', 'LUKE G U', 'KIMZEY A', 'CROSS H CATTLE', 'NOLAN', 'Recruit F9', 'NEWBY', 'CHURCHMAN BIBLE'] Categories (10, object): ['SHRIMPLIN', 'ALEXANDER D', 'SHANKLE', 'LUKE G U', ..., 'NOLAN', 'Recruit F9', 'NEWBY', 'CHURCHMAN BIBLE']
Call the function to plot the SHRIMPLIN well:
make_facies_log_plot(
data[data['Well Name'] == 'SHRIMPLIN'],
facies_colors)
# plt.savefig("Well_example.png", dpi=400)
1-1-2 Bar plot¶
We can use the Counter function to evaluate each class contribution quantitatively. To see the facies frequency distribution, we can use a bar plot:
cn = Counter(data.FaciesLabels)
for i,j in cn.items():
percent = j / len(data) * 100
print('Class=%s, Count=%d, Percentage=%.3f%%' % (i, j, percent))
Class=FSiS, Count=780, Percentage=18.800% Class=CSiS, Count=940, Percentage=22.656% Class=PS, Count=686, Percentage=16.534% Class=WS, Count=582, Percentage=14.027% Class=D, Count=141, Percentage=3.398% Class=SiSh, Count=271, Percentage=6.532% Class=MS, Count=296, Percentage=7.134% Class=BS, Count=185, Percentage=4.459% Class=SS, Count=268, Percentage=6.459%
plt.bar(cn.keys(), cn.values(), color=facies_colors )
plt.title('Facies Distribution')
plt.ylabel('Frequency')
# plt.savefig("bar_plot.png", dpi=400)
This is an imbalanced dataset. Dolomite has the lowest representation: it appears about one-seventh as often as coarse siltstone (141 vs. 940 samples).
1-1-3 Cross-plot¶
To visualize multiple pairwise bivariate distributions in a dataset, we may use the pairplot() function from the seaborn library. It shows the relationship for each combination of variables in a matrix format, with a univariate distribution plot on the diagonal. It is clear that the PE log has a non-linear relationship with the average porosity. Other pairs do not show a clear pattern. The distribution plots on the diagonal show that each facies class has acceptable separation with respect to each feature, although there is strong overlap between classes. The ideal pattern would be a clear separation of the distribution plots into tall, bell-shaped normal distribution curves.
sns_plot = sns.pairplot(data.drop(['Well Name','Facies','Formation','Depth','NM_M','RELPOS'],axis=1),
hue='FaciesLabels', palette=facies_color_map,
hue_order=list(reversed(facies_labels)))
sns_plot.savefig('cross_plots.png')
1-2-1 NaN imputation¶
Hereafter, we will store the dataset in a new variable after each main operation (the indented items in the introduction).
data_fe = data
data_fe.isna().sum()
Facies 0 Formation 0 Well Name 0 Depth 0 GR 0 ILD_log10 0 DeltaPHI 0 PHIND 0 PE 917 NM_M 0 RELPOS 0 FaciesLabels 0 dtype: int64
# to find out which wells do not have PE
df_null = data_fe.loc[data_fe.PE.isna()]
df_null['Well Name'].unique()
['ALEXANDER D', 'KIMZEY A', 'Recruit F9'] Categories (3, object): ['ALEXANDER D', 'KIMZEY A', 'Recruit F9']
Here, PE has 917 null values.
There are several ways to deal with null values in a dataset. The simplest approach is to drop the rows containing at least one null value. This can be reasonable for a larger dataset, but in small data frames single points matter. We can also impute null values with the column mean, or from adjacent data points in the column. Filling with the mean keeps the column mean unchanged but shrinks its variance and can bias the data. Filling from neighboring cells in the column can be appropriate if we have a geologically homogeneous medium, like massive pure carbonate rock.
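For comparison, here is a minimal sketch of those simpler options in pandas; it assumes the data_fe frame defined above, and its results are not used in the rest of the workflow.
# simpler NaN-handling options, for illustration only
drop_rows = data_fe.dropna(subset=['PE'])                               # drop rows where PE is missing
fill_mean = data_fe['PE'].fillna(data_fe['PE'].mean())                  # impute with the column mean
fill_prev = data_fe.sort_values('Depth')['PE'].fillna(method='ffill')   # copy the previous depth sample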
Another approach, the one I will implement here, is to employ a machine learning model to predict the missing values. This is a good fit for this dataset because only a single feature, PE, has missing values. Compared with filling with a single mean value, an ML prediction also lets us assess the correlation and accuracy by dividing the data into train and test sets.
data_fe.corr().style.background_gradient(cmap='coolwarm').set_precision(2)
| | Facies | Depth | GR | ILD_log10 | DeltaPHI | PHIND | PE | NM_M | RELPOS |
|---|---|---|---|---|---|---|---|---|---|
Facies | 1.00 | 0.31 | -0.39 | 0.38 | -0.24 | -0.36 | 0.70 | 0.85 | 0.08 |
Depth | 0.31 | 1.00 | -0.09 | 0.20 | 0.07 | -0.10 | 0.28 | 0.28 | 0.00 |
GR | -0.39 | -0.09 | 1.00 | -0.21 | 0.18 | 0.27 | -0.29 | -0.32 | -0.18 |
ILD_log10 | 0.38 | 0.20 | -0.21 | 1.00 | -0.10 | -0.54 | 0.38 | 0.49 | 0.09 |
DeltaPHI | -0.24 | 0.07 | 0.18 | -0.10 | 1.00 | -0.19 | 0.01 | -0.18 | 0.02 |
PHIND | -0.36 | -0.10 | 0.27 | -0.54 | -0.19 | 1.00 | -0.57 | -0.48 | -0.03 |
PE | 0.70 | 0.28 | -0.29 | 0.38 | 0.01 | -0.57 | 1.00 | 0.66 | 0.02 |
NM_M | 0.85 | 0.28 | -0.32 | 0.49 | -0.18 | -0.48 | 0.66 | 1.00 | 0.03 |
RELPOS | 0.08 | 0.00 | -0.18 | 0.09 | 0.02 | -0.03 | 0.02 | 0.03 | 1.00 |
We can use various ML models to predict the PE log as a continuous regression problem. Here, I will employ the Multi-Layer Perceptron neural network from scikit-learn to predict the target values. I will not go deep into this approach and will simply use it to predict the missing values.
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model
set_PE = data_fe[['Facies','Depth', 'GR', 'ILD_log10',
'DeltaPHI', 'PHIND', 'PE', 'NM_M', 'RELPOS']].dropna() # select features and target log that has value
X = set_PE[['Facies','Depth', 'GR', 'ILD_log10',
'DeltaPHI', 'PHIND', 'NM_M', 'RELPOS']] # feature selection without null value
XX = data_fe[['Facies','Depth', 'GR', 'ILD_log10',
'DeltaPHI', 'PHIND', 'NM_M', 'RELPOS']]
y = set_PE['PE'] # target log
# scaling
scaler = StandardScaler()
X = scaler.fit_transform(X)
X_b = scaler.fit_transform(XX)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
%%time
MLP_pe = MLPRegressor(random_state=1, max_iter= 500).fit(X_train, y_train) #fit the model
MLP_pe.score(X_test, y_test) # examine accuracy
Wall time: 3.1 s
0.776094801295609
data_fe['PE_pred'] = MLP_pe.predict(X_b) # predict PE
data_fe.PE.fillna(data_fe.PE_pred, inplace=True) # fill NaN values with the predicted PE
Plot predicted PE¶
make_facies_log_plot(
data[data['Well Name'] == 'ALEXANDER D'],
facies_colors)
plt.savefig("predicted_PE.png", dpi=400)
Predicted PE in well ALEXANDER D shows a normal range and variation. The prediction score on the test set is about 0.78 (R²).
# remove predicted PE column
data_fe = data_fe.drop(columns=['PE_pred'])
data = data.drop(columns=['PE_pred'])
1-2-2 Feature extraction¶
Having a limited set of features in this dataset leads us to think about extracting some new features from the existing data.
First, we can convert the Formation categorical data into numeric data. Our background knowledge suggests that some facies are more likely to be present in a specific formation than in others.
We can use the LabelEncoder function:
data_fe['Formation_num'] = LabelEncoder().fit_transform(data_fe['Formation'].astype('str')) + 1
f_num = pd.get_dummies(data_fe['Formation'], prefix='fm')
data_fe = pd.concat([data_fe,f_num ], axis=1, join="inner" )
We converted the Formation categorical data into a numeric predictor, adding 1 so that the encoding starts from 1 instead of zero, and also one-hot encoded the formations with get_dummies. To see whether this feature extraction improves prediction, we should define a baseline model and then compare it with the extracted-feature model.
pd.set_option('display.max_columns', None)
data_fe.head()
| | Facies | Formation | Well Name | Depth | GR | ILD_log10 | DeltaPHI | PHIND | PE | NM_M | RELPOS | FaciesLabels | fm_A1 LM | fm_A1 SH | fm_B1 LM | fm_B1 SH | fm_B2 LM | fm_B2 SH | fm_B3 LM | fm_B3 SH | fm_B4 LM | fm_B4 SH | fm_B5 LM | fm_B5 SH | fm_C LM | fm_C SH |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 3 | A1 SH | SHRIMPLIN | 2793.0 | 77.45 | 0.664 | 9.9 | 11.915 | 4.6 | 1 | 1.000 | FSiS | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 3 | A1 SH | SHRIMPLIN | 2793.5 | 78.26 | 0.661 | 14.2 | 12.565 | 4.1 | 1 | 0.979 | FSiS | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 3 | A1 SH | SHRIMPLIN | 2794.0 | 79.05 | 0.658 | 14.8 | 13.050 | 3.6 | 1 | 0.957 | FSiS | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 3 | A1 SH | SHRIMPLIN | 2794.5 | 86.10 | 0.655 | 13.9 | 13.115 | 3.5 | 1 | 0.936 | FSiS | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 3 | A1 SH | SHRIMPLIN | 2795.0 | 74.58 | 0.647 | 13.5 | 13.300 | 3.4 | 1 | 0.915 | FSiS | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
data_fe['Well Name'].unique()
['SHRIMPLIN', 'ALEXANDER D', 'SHANKLE', 'LUKE G U', 'KIMZEY A', 'CROSS H CATTLE', 'NOLAN', 'Recruit F9', 'NEWBY', 'CHURCHMAN BIBLE'] Categories (10, object): ['SHRIMPLIN', 'ALEXANDER D', 'SHANKLE', 'LUKE G U', ..., 'NOLAN', 'Recruit F9', 'NEWBY', 'CHURCHMAN BIBLE']
Pick a well as a blind well¶
The machine learning algorithms will not see this data during training. We will use it at the end to see how the models perform. Remember to select a well that includes all facies classes; otherwise, a data-dimension inconsistency during prediction will generate an error. Alternatively, simply add an example of the missing facies to the well to avoid the problem.
blind = data_fe[data_fe['Well Name'] == 'KIMZEY A']
data_fe = data_fe[data_fe['Well Name'] != 'KIMZEY A']
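As a quick sanity check (a minimal sketch, assuming the facies are coded 1 through 9 as above), we can verify whether the blind well contains every class:
# check which facies classes, if any, are absent from the blind well
missing = set(range(1, 10)) - set(blind['Facies'].unique())
print('Facies missing from blind well:', missing if missing else 'none')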
To see whether the new feature extraction improves prediction, we define a baseline model and then compare it with the extracted-feature model.
Baseline Model Performance¶
For simplicity, we will use a logistic regression classifier as a baseline model and will examine model performance with a cross-validation concept. Data will be split into 10 subgroups and the process will be repeated 3 times.
y = data_fe.pop('Facies')
X = data_fe.drop(columns=['Formation', 'Well Name', 'FaciesLabels'])
from numpy import mean
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='liblinear')
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# evaluate model
scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
print('Accuracy: %.3f' % (mean(scores)))
Accuracy: 0.593
Here, we can explore whether feature extraction can improve model performance. There are many approaches; we will use some transforms that change the distribution of the input variables, such as QuantileTransformer and KBinsDiscretizer. Then we will remove linear dependencies between the input variables using PCA and TruncatedSVD.
To study more, refer here.
Using the FeatureUnion class, we define a list of transforms whose results are aggregated together. This creates a dataset with many feature columns, so we need to reduce the dimensionality for faster and better performance. Finally, the Recursive Feature Elimination (RFE) technique can be used to select the most relevant features. We select 30 features.
%%time
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_selection import RFE
from sklearn.decomposition import PCA
#-------------------------------------------------- append transforms into a list
transforms = list()
transforms.append(('qt', QuantileTransformer(n_quantiles=100, output_distribution='normal')))
transforms.append(('kbd', KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform')))
transforms.append(('pca', PCA(n_components=7)))
transforms.append(('svd', TruncatedSVD(n_components=7)))
#-------------------------------------------------- initialize the feature union
fu = FeatureUnion(transforms)
#-------------------------------------------------- define the feature selection
rfe = RFE(estimator=LogisticRegression(solver='liblinear'), n_features_to_select=30)
#-------------------------------------------------- define the model
model = LogisticRegression(solver='liblinear')
#-------------------------------------------------- use pipeline to chain operation
steps = list()
steps.append(('fu', fu))
steps.append(('rfe', rfe))
steps.append(('ml', model))
pipeline = Pipeline(steps=steps)
# define the cross-validation procedure
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# evaluate model
scores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
# report performance
print('Accuracy: %.3f' % (mean(scores)))
Accuracy: 0.613 Wall time: 1min 39s
The accuracy improvement shows that feature extraction can be a useful approach when we are dealing with a limited number of features in a dataset.
1-2-3 Oversampling¶
In imbalanced datasets, we can use resampling techniques to add data points and increase the number of members in the minority groups. This can be helpful whenever the minority target labels have special importance, such as in credit card fraud detection: fraud may occur in less than 0.1 percent of transactions, yet detecting it is critical.
In this work, we will add pseudo-observations for the Dolomite class, which has the lowest population.
Synthetic Minority Oversampling Technique (SMOTE): this technique selects a minority-class example, finds its nearest neighbours in the feature space, and generates new synthetic examples along the line segments connecting them. The method does not merely duplicate examples from the minority class; it applies k-nearest neighbours to generate synthetic data.
#!pip install imblearn
from imblearn.over_sampling import SMOTE
smote = SMOTE()
X_sm1 , y_sm1 = smote.fit_resample(X,y)
X_sm, y_sm = X_sm1, y_sm1 # keep a copy for future plotting and comparison
print("Before SMOTE: ", Counter(y))
print("After SMOTE: ", Counter(y_sm))
Before SMOTE: Counter({2: 855, 3: 706, 8: 596, 6: 531, 1: 259, 5: 243, 4: 228, 9: 178, 7: 114}) After SMOTE: Counter({3: 855, 2: 855, 8: 855, 6: 855, 7: 855, 4: 855, 5: 855, 9: 855, 1: 855})
Now the dataset is balanced. Let's see how it performs compared with the baseline model:
scaler = StandardScaler()
X_sm = scaler.fit_transform(X_sm)
model_bal = LogisticRegression(solver='liblinear')
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# evaluate model
scores = cross_val_score(model_bal, X_sm, y_sm, scoring='accuracy', cv=cv, n_jobs=-1)
print('Accuracy: %.3f' % (mean(scores)))
Accuracy: 0.652
Accuracy improved by a few percentage points, but in multi-class classification accuracy is not the best evaluation metric. We will cover other metrics in the next parts.
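As a preview, a class-balanced metric such as macro-averaged F1 can be requested from the same cross-validation call. This is only a minimal sketch reusing the model_bal, cv, X_sm and y_sm objects defined above:
# macro F1 weights every class equally, which is more informative than accuracy on imbalanced multi-class problems
f1_scores = cross_val_score(model_bal, X_sm, y_sm, scoring='f1_macro', cv=cv, n_jobs=-1)
print('Macro F1: %.3f' % (mean(f1_scores)))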
1-3-1 Feature linear correlation¶
data_fi = data
The concept is simple: features that correlate more strongly with the target values are more important for prediction. We can extract linear regression coefficients as importance scores like this:
# linear regression coefficients for feature importance
from sklearn.linear_model import LinearRegression
from matplotlib import pyplot
# define the model
model = LinearRegression()
# fit the model
model.fit(X, y)
# get importance
importance = model.coef_
# summarize feature importance
for i,v in enumerate(importance):
print('Feature: %0d, Score: %.5f' % (i,v))
# plot feature importance
pyplot.bar([x for x in range(len(importance))], importance)
pyplot.title('Linear Regression Coefficients as Feature Importance Scores')
pyplot.savefig('reg_importance.png')
pyplot.show()
Feature: 0, Score: 0.00106 Feature: 1, Score: -0.00753 Feature: 2, Score: 0.26791 Feature: 3, Score: -0.01909 Feature: 4, Score: 0.04268 Feature: 5, Score: 0.67395 Feature: 6, Score: 1.72536 Feature: 7, Score: 0.30698 Feature: 8, Score: 0.46538 Feature: 9, Score: -0.88411 Feature: 10, Score: 1.24697 Feature: 11, Score: -1.11251 Feature: 12, Score: 1.28419 Feature: 13, Score: -1.10428 Feature: 14, Score: 1.47229 Feature: 15, Score: -1.07719 Feature: 16, Score: 0.91050 Feature: 17, Score: -1.46548 Feature: 18, Score: 1.61115 Feature: 19, Score: -0.58955 Feature: 20, Score: 0.18510 Feature: 21, Score: -0.94244
1-3-2 Decision tree¶
This algorithm provides importance scores based on the reduction in the split criterion (such as entropy or Gini impurity) at each node.
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()
# fit the model
model.fit(X, y)
# get importance
importance = model.feature_importances_
# summarize feature importance
for i,v in enumerate(importance):
print('Feature: %0d, Score: %.5f' % (i,v))
# plot feature importance
pyplot.bar([x for x in range(len(importance))], importance)
pyplot.title('Decision tree classifier Feature Importance Scores')
pyplot.savefig('DTree.png')
pyplot.show()
XGBoost is a library that provides an efficient and effective implementation of the stochastic gradient boosting algorithm. This algorithm can be used with scikit-learn via the XGBRegressor and XGBClassifier classes.
#pip install xgboost
from xgboost import XGBClassifier
model = XGBClassifier()
# fit the model
model.fit(X, y)
# get importance
importance = model.feature_importances_
# summarize feature importance
for i,v in enumerate(importance):
print('Feature: %0d, Score: %.5f' % (i,v))
# plot feature importance
pyplot.bar([x for x in range(len(importance))], importance)
pyplot.title('XGBoost classifier Feature Importance Scores')
pyplot.show()
1-3-3 Permutation feature importance¶
Permutation feature importance is a model inspection technique that can be used for any fitted estimator when the data is tabular. This is especially useful for non-linear or opaque estimators. The permutation feature importance is defined to be the decrease in a model score when a single feature value is randomly shuffled.
from sklearn.inspection import permutation_importance
model = LogisticRegression(solver='liblinear')
# fit the model
model.fit(X, y)
# perform permutation importance
results = permutation_importance(model, X, y, scoring='accuracy')
# get importance
importance = results.importances_mean
# summarize feature importance
for i,v in enumerate(importance):
print('Feature: %0d, Score: %.5f' % (i,v))
# plot feature importance
pyplot.bar([x for x in range(len(importance))], importance)
pyplot.title('Permutation Feature Importance Scores')
pyplot.savefig('permu.png')
pyplot.show()
In all these feature importance plots, we can see that predictor number 6 (the PE log) has the most importance for label prediction. Depending on the model we select to evaluate the results, we may keep the features with high importance and neglect the rest to speed up the training process. This is very common when we have plenty of features; in our example dataset, however, we will use all features because the predictors are limited (a feature-selection sketch is shown below).
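If we did want to drop the least important predictors, a minimal sketch with scikit-learn's SelectFromModel (not applied in this project) could look like this:
from sklearn.feature_selection import SelectFromModel
# keep only the features whose tree-based importance is above the median importance
selector = SelectFromModel(DecisionTreeClassifier(), threshold='median')
X_reduced = selector.fit_transform(X, y)
print('Kept %d of %d features' % (X_reduced.shape[1], X.shape[1]))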
2- Build Model & Validate¶
2-1 Baseline Model¶
The philosophy of constructing a baseline model is simple: we need a basic, simple model to see how adjustments to both the data and the model parameters improve performance. In fact, it acts as a scale for comparison.
In this code script, we first define our chosen model classifiers. Then we establish the baseline_model function. In this function, we employ the Pipeline class to implement the stepwise operations of standard scaling (which helps the models run more efficiently) and model fitting for cross-validation. I like Pipeline because it makes the code tidier and more readable.
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
# define Classifiers
log = LogisticRegression()
knn = KNeighborsClassifier()
dtree = DecisionTreeClassifier()
rtree = RandomForestClassifier()
svm = SVC()
nb = GaussianNB()
gbc = GradientBoostingClassifier()
etree = ExtraTreesClassifier()
Define a function that uses a pipeline to implement the data transformation, fit the model, and then cross-validate:
def baseline_model(model_name):
model = model_name
steps = list()
steps.append(('ss', StandardScaler() ))
steps.append(('ml', model))
pipeline = Pipeline(steps=steps)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# balanced X,y from SMOTE can also be used
scores = cross_val_score(pipeline, X_sm, y_sm, scoring='accuracy', cv=cv, n_jobs=-1)
print(model,'Accuracy: %.3f' % (mean(scores)))
Run the function for each classifier:
%%time
baseline_model(log)
baseline_model(knn)
baseline_model(dtree)
baseline_model(rtree)
baseline_model(svm)
baseline_model(nb)
baseline_model(gbc)
baseline_model(etree)
LogisticRegression() Accuracy: 0.664 KNeighborsClassifier() Accuracy: 0.872 DecisionTreeClassifier() Accuracy: 0.850 RandomForestClassifier() Accuracy: 0.916 SVC() Accuracy: 0.746 GaussianNB() Accuracy: 0.318 GradientBoostingClassifier() Accuracy: 0.850 ExtraTreesClassifier() Accuracy: 0.930 Wall time: 49.6 s
Cross-validation, sometimes called rotation estimation or out-of-sample testing, is any of various similar model validation techniques for assessing how the results of a statistical analysis will generalize to an independent data set. Models are usually overfitting when the accuracy score on the training data is much higher than on the test data. One way to examine model performance is to randomly keep part of the dataset as a hold-out; this can be a weakness for small datasets. Another way is to divide the dataset into splits and run repeatedly, with each run using a different test fold, like the picture below. In this approach, cross-validation, the model is examined against all the data without overfitting. However, for this project, we will also keep a single well as a hold-out to examine model performance after all optimizations.
picture from scikit-learn
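The two validation schemes can be contrasted in a short sketch (reusing the X_sm, y_sm and rtree objects defined earlier; shown only for illustration):
# single hold-out split: fast, but the score depends on one random partition
X_tr, X_te, y_tr, y_te = train_test_split(X_sm, y_sm, test_size=0.2, random_state=1)
holdout_score = rtree.fit(X_tr, y_tr).score(X_te, y_te)
# stratified k-fold cross-validation: every sample is used for testing exactly once per repeat
cv_scores = cross_val_score(rtree, X_sm, y_sm, scoring='accuracy',
                            cv=RepeatedStratifiedKFold(n_splits=10, n_repeats=1, random_state=1), n_jobs=-1)
print('Hold-out: %.3f, CV mean: %.3f' % (holdout_score, mean(cv_scores)))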
We repeated each operation 3 times over a dataset that was already divided into 10 equal parts (folds) in cross-validation. In fact, we intended to expose the model to all the data with different combinations of train and test sets without overlap.
Here, we used the average accuracy as the metric for comparing the various models' performance (accuracy and other evaluation metrics will be elaborated in the next post). It is used for simplicity, while for multi-class classification problems accuracy is the weakest model evaluation approach. We will cover model evaluation metrics for multi-class classification problems in the next posts.
The Extra Trees and Random Forest classifiers showed the best accuracy scores for facies label prediction, while the Gaussian Naive Bayes classifier performed poorly.
2-2 Hyper-parameters¶
2-2-1 Grid search¶
We created the baseline models in the previous section without adjusting hyper-parameters (the parameters that are adjustable by the user). Careful selection of these parameters can sometimes improve model results noticeably. Grid search is designed to sweep a range of numeric or string values for each specified hyper-parameter without having to code the loops ourselves.
from sklearn.model_selection import GridSearchCV
X_train, X_test, y_train, y_test = train_test_split(X_sm, y_sm, test_size=0.2, random_state=42)
%%time
#logistic regression classifier
#define hyper parameters and ranges
param_grid_log = [{'C': [0.1, 1, 10], 'solver': ['lbfgs', 'liblinear'],
'max_iter':[100, 300]}]
#apply gridsearch
grid_log = GridSearchCV(log, param_grid=param_grid_log, cv=5)
#fit model with grid search
grid_log.fit(X_train, y_train)
print('The best parameters for log classifier: ', grid_log.best_params_)
The best parameters for log classifier: {'C': 10, 'max_iter': 300, 'solver': 'lbfgs'} Wall time: 12.5 s