import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
import joblib
import warnings
warnings.filterwarnings('ignore')


# Importing the pandas library and reading the CSV file 'Zephyr Data Emo with new categories-Copy1.csv'
# The 'engine' parameter is set to 'python' to handle non-default delimiter options.
# The 'delimiter' parameter is set to ';' to specify that the data in the CSV file is delimited by semicolons.

raw_core = pd.read_csv('Zephyr Data Emo with new categories-Copy1.csv', engine='python', delimiter=';')


# Get the value counts for the 'Activities Detailed' column
value_counts = raw_core['Activities Detailed'].value_counts()

# Create the bar plot
plt.figure(figsize=(10, 6))  # Optional: Adjust the figure size if needed
plt.bar(value_counts.index, value_counts.values)

# Optional: Rotate the x-axis labels if they are too long
plt.xticks(rotation=90)

# Optional: Add labels and title
plt.xlabel('Activities Detailed')
plt.ylabel('Count')
plt.title('Value Counts of Activities Detailed')

# Show the plot
plt.tight_layout()  # Optional: To avoid cutoff of labels
plt.show()


# Creating a new DataFrame 'raw' by filtering out rows where the 'Activities Detailed' column is not equal to 'papper work'.
# This step is essentially excluding rows with the value 'papper work' in the 'Activities Detailed' column which has only 1 entry.
raw = raw_core[raw_core['Activities Detailed'] != 'papper work']


# Assuming 'raw' is a DataFrame containing the data.

# Extracting the 'Activities Detailed' column and assigning it to the 'y' variable.
y = raw[['Activities Detailed']]

# Creating a new DataFrame 'X' by dropping the 'Activities Detailed' and 'Activities' columns from the original DataFrame 'raw'.
# 'X' will contain the remaining features (columns) of the data.
X = raw.drop(columns=['Activities Detailed', 'Activities'])


sns.set_palette("Set1", desat=1)

# Create a FacetGrid
facetgrid = sns.FacetGrid(raw, hue='Activities Detailed', height=6, aspect=2)

# Map the distribution plot
facetgrid.map(sns.kdeplot, 'HR', fill=True)

# Add legend
facetgrid.add_legend()

# Annotations
plt.annotate("dream", xy=(55, 0.06), xytext=(60, 0.08),
             xycoords='data', textcoords='data',
             va='center', ha='right',
             arrowprops=dict(arrowstyle="simple", connectionstyle="arc3,rad=0.1"))

plt.annotate("Competition", xy=(175, 0.005), xytext=(180, 0.02),
             xycoords='data', textcoords='data',
             va='center', ha='left',
             arrowprops=dict(arrowstyle="simple", connectionstyle="arc3,rad=0.1"))

plt.annotate("training", xy=(100, 0.04), xytext=(110, 0.05),
             xycoords='data', textcoords='data',
             va='center', ha='left',
             arrowprops=dict(arrowstyle="simple", connectionstyle="arc3,rad=0.1"))

plt.annotate("training", xy=(100, 0.04), xytext=(110, 0.05),
             xycoords='data', textcoords='data',
             va='center', ha='left',
             arrowprops=dict(arrowstyle="simple", connectionstyle="arc3,rad=0.1"))

plt.annotate("public speaking", xy=(120, 0.02), xytext=(125, 0.03),
             xycoords='data', textcoords='data',
             va='center', ha='left',
             arrowprops=dict(arrowstyle="simple", connectionstyle="arc3,rad=0.1"))

# Show the plot
plt.show()


groups = raw.groupby('Activities Detailed')

# Create an empty dictionary to store the smaller dataframes
smaller_dfs = {}

# Iterate over each group and create separate dataframes
for label, group_df in groups:
    smaller_dfs[label] = group_df.copy()
    
sns.set_palette("Set1", desat=1)

# Iterate over each label and corresponding dataframe
for label, df in smaller_dfs.items():
    # Create a new plot for each dataframe
    plt.figure(figsize=(10, 6))

    # Create a FacetGrid for the current dataframe
    facetgrid = sns.FacetGrid(df, hue='Activities Detailed', height=3, aspect=2)

    # Map the distribution plot
    facetgrid.map(sns.kdeplot, 'HR', fill=True)

    # Add legend
    facetgrid.add_legend()
    
    mean_val = df['HR'].mean()
    min_val = df['HR'].min()
    max_val = df['HR'].max()

    # Add text annotations for mean, min, and max values
    plt.text(0.6, 0.8, f"Mean: {mean_val:.2f}", transform=plt.gca().transAxes)
    plt.text(0.6, 0.7, f"Min: {min_val:.2f}", transform=plt.gca().transAxes)
    plt.text(0.6, 0.6, f"Max: {max_val:.2f}", transform=plt.gca().transAxes)


    # Set title
    plt.title(label)

    # Show the plot
    plt.show();

<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>


correlation_matrix = raw.drop(columns = ['Controled stress', ' stress',
       'Before Controled stress', 'After controlled stress','Jumps','JumpFlightTime']).select_dtypes(include=['float64', 'int64']).corr().round(2)

# Create the heatmap
plt.figure(figsize=(18, 8))  # Adjust the figure size as needed
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')

# Set the plot title
plt.title('Pairwise Correlation Heatmap')

# Show the plot
plt.show()


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42, stratify = y)


class DataCleanerTransformer:
    def __init__(self):
        # Define the columns that need to be converted to float
        self.float_columns = ['Activity', 'PeakAcceleration', 'ECGAmplitude', 'ECGNoise', 'CoreTemp',
                              'AvStepPeriod', 'AvForceDevRate', 'AvStepImpulse']
        
    def fit(self, data):
        # Nothing to fit in this case, so we return self.
        return self

    def transform(self, data):
        # Convert selected columns to float
        data[self.float_columns] = data[self.float_columns].apply(lambda x: x.str.replace(',', '.').astype(float))

        # Convert 'Time' column to pandas datetime format
        data['Time_New'] = pd.to_datetime(data['Time'], format="%d.%m.%Y %H:%M:%S")

        # Fill NaN values with 0
        data = data.fillna(0)
        
        return data

    def fit_transform(self, data):
        # Fit the transformer (in this case, there's nothing to fit)
        self.fit(data)

        # Transform the data using the transform method
        return self.transform(data)


# Example usage:
# Instantiate the DataCleanerTransformer class
data_cleaner = DataCleanerTransformer()

X_train_loaded = data_cleaner.fit_transform(X_train)


drop = [
        #The model overfits and remmember when what activities were done so they are removed
        'Year', 'Month', 'Weekday', 'Time', 'Time_New','Date','Hour',
    
        #We have few participants so the model is good at overfitting who did what
        'Name of the volunteer',
        
        #These are health KPIs of how the Holter machine works and aren't dependant on activity.
        'BRAmplitude','ECGAmplitude', 'HRConfidence',
        
        #These measure how the machine works and whether it detects noise. This is what we model.
        'ECGNoise','Jumps','JumpFlightTime'
    
        #Stress is mannualy inputted
#         ,'Controled stress',' stress', 'Before Controled stress', 'After controlled stress'
       ]


X_preprocessed = X_train_loaded.drop(columns = drop)


ratios = (X_preprocessed.nunique() / X_preprocessed.count()).sort_values(ascending=True)

# Create a bar plot to visualize the ratios
plt.figure(figsize=(10, 6))
plt.barh(ratios.index, ratios.values)
plt.xlabel('Ratio of Unique Values to Total Count')
plt.ylabel('Columns')
plt.title('Ratio of Unique Values to Total Count for each Column')
plt.tight_layout()
plt.show()


import matplotlib.pyplot as plt

# Assuming you have already calculated the ratio of variance to mean
ratios = (X_preprocessed.var() / X_preprocessed.mean()).sort_values(ascending=True)

# Create a bar plot to visualize the ratios
plt.figure(figsize=(10, 6))

# Color negative values in red, and non-negative values in the default color (blue)
colors = ['red' if value < 0 else 'blue' for value in ratios.values]

plt.barh(ratios.index, ratios.values, color=colors)
plt.xlabel('Ratio of Variance to Mean')
plt.ylabel('Columns')
plt.title('Ratio of Variance to Mean for each Column')
plt.tight_layout()
plt.show()


import matplotlib.pyplot as plt

# Assuming you have already calculated the ratio of unique values to total count and variance to mean
unique_ratios = (X_preprocessed.nunique() / X_preprocessed.count()).sort_values(ascending=True)
variance_ratios = (X_preprocessed.var() / X_preprocessed.mean()).sort_values(ascending=True)

# Create a scatter plot to combine the two ratios
plt.figure(figsize=(10, 6))


# Plot the scatter plot with unique-to-total count ratios on the x-axis and variance-to-mean ratios on the y-axis
plt.scatter(unique_ratios.values, variance_ratios.values, color=colors)

# Annotate each point with the column name next to the data point
for i, col in enumerate(unique_ratios.index):
    plt.annotate(col, xy=(unique_ratios.values[i], variance_ratios.values[i]),
                 xytext=(5, 0), textcoords='offset points')

# Set log scales for both x and y axes
plt.xscale('log')
plt.yscale('log')

plt.xlabel('Unique-to-Total Count Ratio')
plt.ylabel('Variance-to-Mean Ratio')
plt.title('Unique-to-Total Count vs. Variance-to-Mean Ratio for each Column (Log Scales)')
plt.tight_layout()

plt.show()


class AnyGreaterThanZeroTransformer:
    def __init__(self, columns, c_name):
        
        self.columns = columns
        self.c_name = c_name
        
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # Concatenate the names of the specified columns into a new column
        X[f'{self.c_name}'] =   (X[self.columns] > 0).any(axis=1).astype(int)

        # Drop the original specified columns as they are no longer needed
        X.drop(columns=self.columns, inplace=True)

        return X

    def fit_transform(self, X, y=None):
        return self.fit(X).transform(X)


X_non_binary = X_preprocessed.copy()


# Plot histograms for the specified columns
plt.figure(figsize=(10, 6))
X_preprocessed.hist(column=['Controled stress', ' stress', 'Before Controled stress', 'After controlled stress'], bins=10)
plt.xlabel('Values')
plt.ylabel('Frequency')
# plt.title('Histograms for the Specified Columns')
plt.tight_layout()
plt.show()

# Plot boxplots for the specified columns
plt.figure(figsize=(10, 6))
sns.boxplot(data=X_preprocessed[['Controled stress', ' stress', 'Before Controled stress', 'After controlled stress']])
plt.xlabel('Columns')
plt.ylabel('Values')
# plt.title('Boxplots for the Specified Columns')
plt.tight_layout()
plt.show()

<Figure size 1000x600 with 0 Axes>


# Use the fit_transform method to create the new column 'AnyGreaterThanZero'
Stress_Data = X_non_binary.copy()
Stress_Data = AnyGreaterThanZeroTransformer(columns=['Controled stress',' stress', 'Before Controled stress', 'After controlled stress'],
                                            c_name = 'Stress_Binary')\
                                            .fit_transform(X_non_binary)


# Plot histograms for the specified columns
plt.figure(figsize=(10, 6))
X_preprocessed.hist(column=['MajorImpacts','MinorImpacts'], bins=10)
plt.xlabel('Values')
plt.ylabel('Frequency')
# plt.title('Histograms for the Specified Columns')
plt.tight_layout()
plt.show()

# Plot boxplots for the specified columns
plt.figure(figsize=(10, 6))
sns.boxplot(data=X_preprocessed[['MajorImpacts','MinorImpacts']])
plt.xlabel('Columns')
plt.ylabel('Values')
# plt.title('Boxplots for the Specified Columns')
plt.tight_layout()
plt.show()

<Figure size 1000x600 with 0 Axes>


Impact_Data = AnyGreaterThanZeroTransformer(columns=['MajorImpacts','MinorImpacts']
                                           ,c_name = 'Impact_Binary')\
                                            .fit_transform(Stress_Data)


# Plot histograms for the specified columns
plt.figure(figsize=(10, 6))
X_preprocessed.hist(column=['AvForceDevRate', 'AvStepImpulse', 'AvStepPeriod'], bins=10)
plt.xlabel('Values')
plt.ylabel('Frequency')
# plt.title('Histograms for the Specified Columns')
plt.tight_layout()
plt.show()

# Plot boxplots for the specified columns
plt.figure(figsize=(10, 6))
sns.boxplot(data=X_preprocessed[['AvForceDevRate', 'AvStepImpulse', 'AvStepPeriod']])
plt.xlabel('Columns')
plt.ylabel('Values')
# plt.title('Boxplots for the Specified Columns')
plt.tight_layout()
plt.show()

<Figure size 1000x600 with 0 Axes>


Av_Data = AnyGreaterThanZeroTransformer(columns=['AvForceDevRate', 'AvStepImpulse', 'AvStepPeriod']
                                           ,c_name = 'Av_Binary')\
                                            .fit_transform(Stress_Data)


# Plot histograms for the specified columns
plt.figure(figsize=(10, 6))
X_preprocessed.hist(column=['PeakAcceleration', 'PeakAccelPhi', 'peakAccelTheta'], bins=10)
plt.xlabel('Values')
plt.ylabel('Frequency')
# plt.title('Histograms for the Specified Columns')
plt.tight_layout()
plt.show()

# Plot boxplots for the specified columns
plt.figure(figsize=(10, 6))
sns.boxplot(data=X_preprocessed[['PeakAcceleration', 'PeakAccelPhi', 'peakAccelTheta']])
plt.xlabel('Columns')
plt.ylabel('Values')
# plt.title('Boxplots for the Specified Columns')
plt.tight_layout()
plt.show()

<Figure size 1000x600 with 0 Axes>


Accel_Data = AnyGreaterThanZeroTransformer(columns=['PeakAcceleration']
                                           ,c_name = 'Acc_Binary')\
                                            .fit_transform(Stress_Data)


# Plot histograms for the specified columns
plt.figure(figsize=(10, 6))
X_preprocessed.hist(column=['Bounds', 'RunSteps'], bins=10)
plt.xlabel('Values')
plt.ylabel('Frequency')
# plt.title('Histograms for the Specified Columns')
plt.tight_layout()
plt.show()

# Plot boxplots for the specified columns
plt.figure(figsize=(10, 6))
sns.boxplot(data=X_preprocessed[['Bounds', 'RunSteps']])
plt.xlabel('Columns')
plt.ylabel('Values')
# plt.title('Boxplots for the Specified Columns')
plt.tight_layout()
plt.show()

<Figure size 1000x600 with 0 Axes>


Bounds_Data = AnyGreaterThanZeroTransformer(columns=['Bounds']
                                           ,c_name = 'Bounds_Binary')\
                                            .fit_transform(Stress_Data)


X_train_outlier = Bounds_Data.copy()


# Identify non-binary columns
non_binary_columns = ['HR','HRV','BR','Posture', 'Activity']
binary_clumns = X_train_outlier.drop(columns = non_binary_columns).columns.tolist()


# Plot boxplots for the specified columns
plt.figure(figsize=(10, 6))
sns.boxplot(data=X_train_outlier[non_binary_columns])
plt.xlabel('Columns')
plt.ylabel('Values')
# plt.title('Boxplots for the Specified Columns')
plt.tight_layout()
plt.show()


class OutlierReplacer:
    def __init__(self, columns):
        self.columns = columns
        self.iqr_ranges = {}

    def fit(self, X, y=None):
        for column in self.columns:
            Q1 = X[column].quantile(0.25)
            Q3 = X[column].quantile(0.75)
            IQR = Q3 - Q1
            self.iqr_ranges[column] = (Q1 - 1.5 * IQR, Q3 + 1.5 * IQR)

        return self

    def transform(self, X):
        X_transformed = X.copy()

        for column in self.columns:
            if column in self.iqr_ranges:
                outliers = (X_transformed[column] < self.iqr_ranges[column][0]) | (X_transformed[column] > self.iqr_ranges[column][1])
                detected_outliers = X_transformed[outliers]

                clean_data_iqr = X_transformed[~outliers].fillna(0)

                mean_value = clean_data_iqr[column].mean()
                noise = np.random.normal(0, 0.1, len(detected_outliers))
                mean_value_with_noise = noise + mean_value

                X_transformed.loc[outliers, column] = mean_value_with_noise

        return X_transformed

    def fit_transform(self, X, y=None):
        return self.fit(X).transform(X)


# Instantiate the OutlierReplacer class and fit_transform on training data
outlier_replacer = OutlierReplacer(columns=non_binary_columns).fit(X_train_outlier)

X_train_less_outliers = outlier_replacer.transform(X_train_outlier)


scaler = StandardScaler().fit(X_train_less_outliers)


# Make a copy of the original DataFrame
X_train_scaled = X_train_less_outliers.copy()

# Instantiate the StandardScaler with the selected columns and fit and transform them
scaler = StandardScaler().fit(X_train_less_outliers[non_binary_columns])

X_train_scaled[non_binary_columns] = scaler.transform(X_train_less_outliers[non_binary_columns])


label_encoder = LabelEncoder().fit(y_train)


y_train['Target'] = label_encoder.transform(y_train)


y_train_target = y_train[['Target']]


clf1 = RandomForestClassifier(
    n_estimators=271,
    max_depth=36,
    min_samples_split=19,
    min_samples_leaf=11,
    random_state=42
)



clf2 = XGBClassifier(
    learning_rate=0.06048731265187917,
    n_estimators=195,
    max_depth=9,
    subsample= 0.7790510010086706,
    colsample_bytree=0.6220627611238871,
    random_state=42
)

clf3 = AdaBoostClassifier(
    n_estimators=170,
    learning_rate=0.2379351625419417,
    random_state=42
)

clf4 = GradientBoostingClassifier(
    n_estimators=186,
    learning_rate=0.01208563915935721,
    max_depth=9,
    subsample=0.7424149856794916,
    random_state=42
)


clf5 = DecisionTreeClassifier(
    max_depth=16,
    min_samples_split=9,
    min_samples_leaf=4,
    random_state=42
)


# Define the ensemble classifier with the scaler
models = VotingClassifier(estimators=[
    ('rfc', clf1), 
    ('xgb', clf2), 
    ('ada', clf3), 
    ('gbc', clf4), 
    ('dtc', clf5)],
    voting='soft')

# Fit the pipeline to the training data
models.fit(X_train_scaled[['HR','HRV','BR','Posture', 'Activity', 'Impact_Binary','Bounds_Binary']], y_train_target)

VotingClassifier(estimators=[('rfc',
                              RandomForestClassifier(max_depth=36,
                                                     min_samples_leaf=11,
                                                     min_samples_split=19,
                                                     n_estimators=271,
                                                     random_state=42)),
                             ('xgb',
                              XGBClassifier(base_score=None, booster=None,
                                            callbacks=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=0.6220627611238871,
                                            early_stopping_rounds=None,
                                            enable_categorical=False,
                                            eval...
                             ('ada',
                              AdaBoostClassifier(learning_rate=0.2379351625419417,
                                                 n_estimators=170,
                                                 random_state=42)),
                             ('gbc',
                              GradientBoostingClassifier(learning_rate=0.01208563915935721,
                                                         max_depth=9,
                                                         n_estimators=186,
                                                         random_state=42,
                                                         subsample=0.7424149856794916)),
                             ('dtc',
                              DecisionTreeClassifier(max_depth=16,
                                                     min_samples_leaf=4,
                                                     min_samples_split=9,
                                                     random_state=42))],
                 voting='soft')

VotingClassifier(estimators=[('rfc',
                              RandomForestClassifier(max_depth=36,
                                                     min_samples_leaf=11,
                                                     min_samples_split=19,
                                                     n_estimators=271,
                                                     random_state=42)),
                             ('xgb',
                              XGBClassifier(base_score=None, booster=None,
                                            callbacks=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=0.6220627611238871,
                                            early_stopping_rounds=None,
                                            enable_categorical=False,
                                            eval...
                             ('ada',
                              AdaBoostClassifier(learning_rate=0.2379351625419417,
                                                 n_estimators=170,
                                                 random_state=42)),
                             ('gbc',
                              GradientBoostingClassifier(learning_rate=0.01208563915935721,
                                                         max_depth=9,
                                                         n_estimators=186,
                                                         random_state=42,
                                                         subsample=0.7424149856794916)),
                             ('dtc',
                              DecisionTreeClassifier(max_depth=16,
                                                     min_samples_leaf=4,
                                                     min_samples_split=9,
                                                     random_state=42))],
                 voting='soft')

RandomForestClassifier(max_depth=36, min_samples_leaf=11, min_samples_split=19,
                       n_estimators=271, random_state=42)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.6220627611238871, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.06048731265187917,
              max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=9, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=195, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=42, ...)

AdaBoostClassifier(learning_rate=0.2379351625419417, n_estimators=170,
                   random_state=42)

GradientBoostingClassifier(learning_rate=0.01208563915935721, max_depth=9,
                           n_estimators=186, random_state=42,
                           subsample=0.7424149856794916)

DecisionTreeClassifier(max_depth=16, min_samples_leaf=4, min_samples_split=9,
                       random_state=42)

X


from sklearn.model_selection import RandomizedSearchCV
from scipy import stats
import dask.diagnostics

# Define the parameter distributions for each classifier
param_distributions = {
    'rfc__n_estimators': stats.randint(100, 500),
    'rfc__max_depth': [None] + list(np.arange(5, 51)),
    'rfc__min_samples_split': stats.randint(2, 20),
    'rfc__min_samples_leaf': stats.randint(1, 20),
    
    'xgb__learning_rate': stats.uniform(0.01, 0.3),
    'xgb__n_estimators': stats.randint(100, 500),
    'xgb__max_depth': stats.randint(2, 10),
    'xgb__subsample': stats.uniform(0.5, 0.5),
    'xgb__colsample_bytree': stats.uniform(0.5, 0.5),
    
    'ada__n_estimators': stats.randint(50, 200),
    'ada__learning_rate': stats.uniform(0.01, 1.0),
    
    'gbc__n_estimators': stats.randint(50, 200),
    'gbc__learning_rate': stats.uniform(0.01, 0.3),
    'gbc__max_depth': stats.randint(2, 10),
    'gbc__subsample': stats.uniform(0.5, 0.5),

    'dtc__max_depth': [None] + list(np.arange(5, 51)),
    'dtc__min_samples_split': stats.randint(2, 20),
    'dtc__min_samples_leaf': stats.randint(1, 20)
}


# # Perform the random search
# search = RandomizedSearchCV(models, param_distributions, n_iter=10, cv=3, verbose=2,  random_state=42)

# with dask.diagnostics.ProgressBar():
#     search.fit(X_train_scaled.sample(n= 10000, random_state= 42)[['HR','HRV','BR','Posture', 'Activity', 'Impact_Binary','Bounds_Binary']], y_train_target.sample(n= 10000, random_state= 42))


best_params_  = {'ada__learning_rate': 0.2379351625419417, 'ada__n_estimators': 170, 'dtc__max_depth': 16, 'dtc__min_samples_leaf': 9, 'dtc__min_samples_split': 4, 'gbc__learning_rate': 0.01208563915935721, 'gbc__max_depth': 9, 'gbc__n_estimators': 186, 'gbc__subsample': 0.7424149856794916, 'rfc__max_depth': 36, 'rfc__min_samples_leaf': 19, 'rfc__min_samples_split': 11, 'rfc__n_estimators': 271, 'xgb__colsample_bytree': 0.6220627611238871, 'xgb__learning_rate': 0.06048731265187917, 'xgb__max_depth': 9, 'xgb__n_estimators': 195, 'xgb__subsample': 0.7790510010086706}

# Check the best parameters
print(best_params_)

#{'ada__learning_rate': 0.2379351625419417, 'ada__n_estimators': 170, 'dtc__max_depth': 16, 'dtc__min_samples_leaf': 9, 'dtc__min_samples_split': 4, 'gbc__learning_rate': 0.01208563915935721, 'gbc__max_depth': 9, 'gbc__n_estimators': 186, 'gbc__subsample': 0.7424149856794916, 'rfc__max_depth': 36, 'rfc__min_samples_leaf': 19, 'rfc__min_samples_split': 11, 'rfc__n_estimators': 271, 'xgb__colsample_bytree': 0.6220627611238871, 'xgb__learning_rate': 0.06048731265187917, 'xgb__max_depth': 9, 'xgb__n_estimators': 195, 'xgb__subsample': 0.7790510010086706}

{'ada__learning_rate': 0.2379351625419417, 'ada__n_estimators': 170, 'dtc__max_depth': 16, 'dtc__min_samples_leaf': 9, 'dtc__min_samples_split': 4, 'gbc__learning_rate': 0.01208563915935721, 'gbc__max_depth': 9, 'gbc__n_estimators': 186, 'gbc__subsample': 0.7424149856794916, 'rfc__max_depth': 36, 'rfc__min_samples_leaf': 19, 'rfc__min_samples_split': 11, 'rfc__n_estimators': 271, 'xgb__colsample_bytree': 0.6220627611238871, 'xgb__learning_rate': 0.06048731265187917, 'xgb__max_depth': 9, 'xgb__n_estimators': 195, 'xgb__subsample': 0.7790510010086706}


y_pred_train = models.predict(X_train_scaled[['HR','HRV','BR','Posture', 'Activity', 'Impact_Binary','Bounds_Binary']])


# Generate a classification report
class_report = classification_report(y_pred_train, y_train_target)


print("Classification Report:")
print(class_report)

Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.98        57
           1       0.97      0.94      0.95       448
           2       0.97      0.99      0.98      2004
           3       0.65      0.97      0.78      1680
           4       0.95      0.87      0.91      3627
           5       0.99      1.00      1.00     44166
           6       0.98      1.00      0.99      1252
           7       0.86      0.86      0.86      1755
           8       0.76      0.88      0.82      4412
           9       0.96      0.90      0.93     51942
          10       0.99      0.99      0.99      2782
          11       0.99      0.98      0.99      2218
          12       0.93      0.84      0.88      2461
          13       1.00      0.98      0.99      7251
          14       0.95      0.97      0.96      2163
          15       0.80      0.86      0.83      5265
          16       0.93      0.96      0.95     45417
          17       0.88      0.95      0.91      2692
          18       0.72      0.97      0.82       140
          19       0.97      0.98      0.97       371

    accuracy                           0.94    182103
   macro avg       0.91      0.94      0.92    182103
weighted avg       0.95      0.94      0.95    182103


# Save the pipeline to a file
filename = 'ensamble_fitted_model_summerschool2023_stress_HR_BR_HRV.joblib'
joblib.dump(models, filename)

['ensamble_fitted_model_summerschool2023_stress_HR_BR_HRV.joblib']


# Get feature importances for each individual model
feature_importances = pd.DataFrame()

for name, clf in models.named_estimators_.items():
    if hasattr(clf, 'feature_importances_'):
        feature_importances[name] = clf.feature_importances_

feature_importances.index = X_train_scaled[['HR','HRV','BR','Posture', 'Activity', 'Impact_Binary','Bounds_Binary']].columns


feature_importances


feature_importances.plot(kind='bar')

<Axes: >


# Calculate the correlation matrix
corr_matrix = feature_importances.corr()

# Create a heatmap of the correlation matrix
plt.figure(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.title('Correlation Matrix')
plt.show()


X_test_initial = X_test.copy()
X_test_loaded = data_cleaner.transform(X_test_initial)   #data cleaning
X_test_preprocessed = X_test_loaded.drop(columns = drop) #columns drop
X_test_non_binary = X_test_preprocessed.copy()           #copy df


X_test_Stress_Data = X_test_non_binary.copy()
X_test_Stress_Data = AnyGreaterThanZeroTransformer(columns=['Controled stress',' stress', 'Before Controled stress', 'After controlled stress'],
                                            c_name = 'Stress_Binary')\
                                            .transform(X_test_non_binary)  #binary stress



X_test_Impact_Data = AnyGreaterThanZeroTransformer(columns=['MajorImpacts','MinorImpacts']
                                           ,c_name = 'Impact_Binary')\
                                            .transform(X_test_Stress_Data) #binary impacts

X_test_Av_Data = AnyGreaterThanZeroTransformer(columns=['AvForceDevRate', 'AvStepImpulse', 'AvStepPeriod']
                                           ,c_name = 'Av_Binary')\
                                            .transform(X_test_Impact_Data) #binary av columns 

X_test_Accel_Data = AnyGreaterThanZeroTransformer(columns=['PeakAcceleration']
                                           ,c_name = 'Acc_Binary')\
                                            .transform(X_test_Impact_Data) #binary accel

X_test_Bounds_Data = AnyGreaterThanZeroTransformer(columns=['Bounds']
                                           ,c_name = 'Bounds_Binary')\
                                            .transform(X_test_Accel_Data)  #binary bounds


X_test_outlier = X_test_Bounds_Data.copy()                                              #copying the data before outlier removal
X_test_less_outliers = outlier_replacer.transform(X_test_outlier)                       #outliers removal
X_test_scaled = X_test_less_outliers.copy()                                             #outliers df copy
X_test_scaled[non_binary_columns] = scaler.transform(X_test_scaled[non_binary_columns]) # replace the og df


y_test['Target'] = label_encoder.transform(y_test[['Activities Detailed']]) #transforming labels
y_test_target = y_test[['Target']]                 #predictor class


loaded_model = joblib.load('ensamble_fitted_model_summerschool2023_stress_HR_BR_HRV.joblib')


loaded_model

VotingClassifier(estimators=[('rfc',
                              RandomForestClassifier(max_depth=36,
                                                     min_samples_leaf=11,
                                                     min_samples_split=19,
                                                     n_estimators=271,
                                                     random_state=42)),
                             ('xgb',
                              XGBClassifier(base_score=None, booster=None,
                                            callbacks=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=0.6220627611238871,
                                            early_stopping_rounds=None,
                                            enable_categorical=False,
                                            eval...
                             ('ada',
                              AdaBoostClassifier(learning_rate=0.2379351625419417,
                                                 n_estimators=170,
                                                 random_state=42)),
                             ('gbc',
                              GradientBoostingClassifier(learning_rate=0.01208563915935721,
                                                         max_depth=9,
                                                         n_estimators=186,
                                                         random_state=42,
                                                         subsample=0.7424149856794916)),
                             ('dtc',
                              DecisionTreeClassifier(max_depth=16,
                                                     min_samples_leaf=4,
                                                     min_samples_split=9,
                                                     random_state=42))],
                 voting='soft')

VotingClassifier(estimators=[('rfc',
                              RandomForestClassifier(max_depth=36,
                                                     min_samples_leaf=11,
                                                     min_samples_split=19,
                                                     n_estimators=271,
                                                     random_state=42)),
                             ('xgb',
                              XGBClassifier(base_score=None, booster=None,
                                            callbacks=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=0.6220627611238871,
                                            early_stopping_rounds=None,
                                            enable_categorical=False,
                                            eval...
                             ('ada',
                              AdaBoostClassifier(learning_rate=0.2379351625419417,
                                                 n_estimators=170,
                                                 random_state=42)),
                             ('gbc',
                              GradientBoostingClassifier(learning_rate=0.01208563915935721,
                                                         max_depth=9,
                                                         n_estimators=186,
                                                         random_state=42,
                                                         subsample=0.7424149856794916)),
                             ('dtc',
                              DecisionTreeClassifier(max_depth=16,
                                                     min_samples_leaf=4,
                                                     min_samples_split=9,
                                                     random_state=42))],
                 voting='soft')

RandomForestClassifier(max_depth=36, min_samples_leaf=11, min_samples_split=19,
                       n_estimators=271, random_state=42)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.6220627611238871, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.06048731265187917,
              max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=9, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=195, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=42, ...)

AdaBoostClassifier(learning_rate=0.2379351625419417, n_estimators=170,
                   random_state=42)

GradientBoostingClassifier(learning_rate=0.01208563915935721, max_depth=9,
                           n_estimators=186, random_state=42,
                           subsample=0.7424149856794916)

DecisionTreeClassifier(max_depth=16, min_samples_leaf=4, min_samples_split=9,
                       random_state=42)


y_test_pred = loaded_model.predict(X_test_scaled[['HR','HRV','BR','Posture', 'Activity', 'Impact_Binary','Bounds_Binary']])


# Generate a classification report
test_class_report = classification_report(y_test_pred,y_test_target )


print("Classification Report:")
print(test_class_report)

Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.96      0.95        28
           1       0.92      0.92      0.92       213
           2       0.94      0.98      0.96       969
           3       0.58      0.96      0.72       745
           4       0.93      0.85      0.89      1781
           5       0.99      0.99      0.99     21762
           6       0.97      0.99      0.98       617
           7       0.80      0.83      0.82       829
           8       0.72      0.84      0.78      2161
           9       0.94      0.87      0.91     25781
          10       0.98      0.99      0.98      1363
          11       0.96      0.95      0.96      1095
          12       0.85      0.81      0.83      1144
          13       0.99      0.97      0.98      3601
          14       0.93      0.96      0.95      1058
          15       0.75      0.81      0.78      2586
          16       0.92      0.95      0.93     22428
          17       0.80      0.89      0.85      1299
          18       0.53      0.71      0.61        70
          19       0.88      0.98      0.93       164

    accuracy                           0.93     89694
   macro avg       0.87      0.91      0.88     89694
weighted avg       0.93      0.93      0.93     89694


from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.colors as colors

# Define a colormap where zero values are white
cmap = colors.ListedColormap(['white', 'lightpink', 'lightblue'])

# Define the bounds for each color
bounds = [0,1,2,3]

# Create a normalization based on the bounds
norm = colors.BoundaryNorm(bounds, cmap.N)

# Compute the confusion matrix
cm = confusion_matrix(y_test_target, y_test_pred)

# Get sorted labels
labels = sorted(y_test['Activities Detailed'].unique())

# Create the heatmap
plt.figure(figsize=(10,7))
sns.heatmap(cm, annot=False, cmap=cmap, norm=norm, xticklabels=labels, yticklabels=labels)

plt.title('Confusion Matrix based on Frequencies', size=16)
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')

plt.show()


# Generate a classification report
class_report = classification_report(y_pred_train, y_train_target, output_dict=True)

test_class_report = classification_report(y_test_pred,y_test_target, output_dict=True) 

# Create dataframes
df_test = pd.DataFrame(test_class_report).transpose()
df = pd.DataFrame(class_report).transpose()

# Add suffixes to the column names
df_test.columns = [str(col) + '_Test' for col in df_test.columns]
df.columns = [str(col) for col in df.columns]

# Concatenate the dataframes
result_df = pd.concat([df, df_test], axis=1).reset_index(drop=False).iloc[:-3,:]
macro_df = pd.concat([df, df_test], axis=1).reset_index(drop=False).iloc[-3:,:]


result_df['index'] = result_df['index'].astype(int)


result_df['Label'] = label_encoder.inverse_transform(result_df['index'])


result_df


result_df.iloc[:-3,:][['precision','precision_Test']].plot(kind='bar')

<Axes: >

	rfc	xgb	ada	gbc	dtc
HR	0.211334	0.063243	0.052941	0.263226	0.180447
HRV	0.193873	0.048195	0.123529	0.205495	0.228883
BR	0.086372	0.026917	0.182353	0.071380	0.076369
Posture	0.240190	0.072173	0.288235	0.239884	0.218524
Activity	0.027014	0.009675	0.023529	0.015290	0.009971
Impact_Binary	0.161471	0.522415	0.205882	0.134348	0.193274
Bounds_Binary	0.079747	0.257381	0.123529	0.070377	0.092531

	index	precision	recall	f1-score	support	precision_Test	recall_Test	f1-score_Test	support_Test	Label
0	0	0.966102	1.000000	0.982759	57.0	0.931034	0.964286	0.947368	28.0	Coherent Breathing
1	1	0.974419	0.935268	0.954442	448.0	0.924528	0.920188	0.922353	213.0	Play Quitar
2	2	0.966423	0.991018	0.978566	2004.0	0.940711	0.982456	0.961131	969.0	cognitive workout
3	3	0.647551	0.967857	0.775948	1680.0	0.576395	0.957047	0.719475	745.0	competition
4	4	0.952439	0.866832	0.907621	3627.0	0.927429	0.846715	0.885236	1781.0	creative writing
5	5	0.991900	0.998121	0.995001	44166.0	0.988853	0.994670	0.991753	21762.0	dream
6	6	0.984252	0.998403	0.991277	1252.0	0.974400	0.987034	0.980676	617.0	leisure
7	7	0.861206	0.862678	0.861941	1755.0	0.799076	0.834741	0.816519	829.0	meeting with client
8	8	0.764510	0.883726	0.819807	4412.0	0.723726	0.841277	0.778087	2161.0	negotiation
9	9	0.957183	0.895210	0.925160	51942.0	0.940739	0.873124	0.905671	25781.0	office work
10	10	0.990298	0.990654	0.990476	2782.0	0.980306	0.986060	0.983175	1363.0	phone calls
11	11	0.992230	0.978810	0.985474	2218.0	0.963788	0.947945	0.955801	1095.0	play piano
12	12	0.925460	0.837464	0.879266	2461.0	0.847767	0.812937	0.829987	1144.0	presentation
13	13	0.995385	0.981658	0.988474	7251.0	0.987791	0.966121	0.976836	3601.0	public speaking
14	14	0.949458	0.972723	0.960950	2163.0	0.931256	0.960302	0.945556	1058.0	reading book
15	15	0.804282	0.863248	0.832723	5265.0	0.748473	0.805491	0.775936	2586.0	sales
16	16	0.932961	0.963384	0.947928	45417.0	0.918788	0.946317	0.932349	22428.0	sport
17	17	0.875300	0.949108	0.910711	2692.0	0.804590	0.890685	0.845451	1299.0	training
18	18	0.715789	0.971429	0.824242	140.0	0.531915	0.714286	0.609756	70.0	walking meeting
19	19	0.967914	0.975741	0.971812	371.0	0.875000	0.981707	0.925287	164.0	writing study

Dependencies¶

Data Load¶

Data Prep¶

Data Analysis¶

Data Split¶

Initial Data Cleaning¶

Data Reduction¶

Data Exploration¶

Non Zero transformer¶

Stress¶

Impact¶

Av* Data¶

Peak Data¶

Bounds¶

Data Cleaning¶

Outlier Replacer¶

Scaler¶

Label Encoder¶

Model¶

Save model¶

Feature Importance¶

X_test & Y_test prep¶

Model Loading¶

	Time	Year	Month	Weekday	Hour	HR	BR	Posture	Activity	PeakAcceleration	...	AvStepImpulse	AvStepPeriod	JumpFlightTime	PeakAccelPhi	peakAccelTheta	Controled stress	stress	Before Controled stress	After controlled stress	Name of the volunteer
0	26.3.2020 9:02:28	2020	March	Thursday	9	79	15	16	0,01	0,02	...	0	0	0	163	-50	1.0	NaN	NaN	NaN	EM
1	26.3.2020 9:02:29	2020	March	Thursday	9	79	15	16	0,01	0,1	...	0	0	0	168	-87	1.0	NaN	NaN	NaN	EM
2	26.3.2020 9:02:30	2020	March	Thursday	9	79	15	17	0,01	0,03	...	0	0	0	164	-47	1.0	NaN	NaN	NaN	EM
3	26.3.2020 9:02:31	2020	March	Thursday	9	79	15	17	0,01	0,02	...	0	0	0	163	-43	1.0	NaN	NaN	NaN	EM
4	26.3.2020 9:02:32	2020	March	Thursday	9	79	15	17	0	0,02	...	0	0	0	163	-50	1.0	NaN	NaN	NaN	EM
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
271793	22.6.2022 6:35:10	2022	June	Wednesday	6	84	20	-34	0,04	0,08	...	2,75	0,351	0	140	91	NaN	1.0	NaN	NaN	EM
271794	22.6.2022 6:35:11	2022	June	Wednesday	6	84	20	-35	0,01	0,03	...	0	0	0	142	91	NaN	1.0	NaN	NaN	EM
271795	22.6.2022 6:35:12	2022	June	Wednesday	6	84	19	-36	0,04	0,19	...	0	0	0	133	95	NaN	1.0	NaN	NaN	EM
271796	22.6.2022 6:35:13	2022	June	Wednesday	6	85	19	-38	0,05	0,09	...	0	0	0	135	95	NaN	1.0	NaN	NaN	EM
271797	22.6.2022 6:35:14	2022	June	Wednesday	6	85	19	-40	0,06	0,1	...	0	0	0	134	99	NaN	1.0	NaN	NaN	EM