Dependencies¶

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
import joblib
import warnings
warnings.filterwarnings('ignore')

Data Load¶

In [2]:
# Importing the pandas library and reading the CSV file 'Zephyr Data Emo with new categories-Copy1.csv'
# The 'engine' parameter is set to 'python' to handle non-default delimiter options.
# The 'delimiter' parameter is set to ';' to specify that the data in the CSV file is delimited by semicolons.

raw_core = pd.read_csv('Zephyr Data Emo with new categories-Copy1.csv', engine='python', delimiter=';')
In [3]:
# Get the value counts for the 'Activities Detailed' column
value_counts = raw_core['Activities Detailed'].value_counts()

# Create the bar plot
plt.figure(figsize=(10, 6))  # Optional: Adjust the figure size if needed
plt.bar(value_counts.index, value_counts.values)

# Optional: Rotate the x-axis labels if they are too long
plt.xticks(rotation=90)

# Optional: Add labels and title
plt.xlabel('Activities Detailed')
plt.ylabel('Count')
plt.title('Value Counts of Activities Detailed')

# Show the plot
plt.tight_layout()  # Optional: To avoid cutoff of labels
plt.show()
In [4]:
# Creating a new DataFrame 'raw' by filtering out rows where the 'Activities Detailed' column is not equal to 'papper work'.
# This step is essentially excluding rows with the value 'papper work' in the 'Activities Detailed' column which has only 1 entry.
raw = raw_core[raw_core['Activities Detailed'] != 'papper work']
In [5]:
# Assuming 'raw' is a DataFrame containing the data.

# Extracting the 'Activities Detailed' column and assigning it to the 'y' variable.
y = raw[['Activities Detailed']]

# Creating a new DataFrame 'X' by dropping the 'Activities Detailed' and 'Activities' columns from the original DataFrame 'raw'.
# 'X' will contain the remaining features (columns) of the data.
X = raw.drop(columns=['Activities Detailed', 'Activities'])

Data Prep¶

Data Analysis¶

In [6]:
sns.set_palette("Set1", desat=1)

# Create a FacetGrid
facetgrid = sns.FacetGrid(raw, hue='Activities Detailed', height=6, aspect=2)

# Map the distribution plot
facetgrid.map(sns.kdeplot, 'HR', fill=True)

# Add legend
facetgrid.add_legend()

# Annotations
plt.annotate("dream", xy=(55, 0.06), xytext=(60, 0.08),
             xycoords='data', textcoords='data',
             va='center', ha='right',
             arrowprops=dict(arrowstyle="simple", connectionstyle="arc3,rad=0.1"))

plt.annotate("Competition", xy=(175, 0.005), xytext=(180, 0.02),
             xycoords='data', textcoords='data',
             va='center', ha='left',
             arrowprops=dict(arrowstyle="simple", connectionstyle="arc3,rad=0.1"))

plt.annotate("training", xy=(100, 0.04), xytext=(110, 0.05),
             xycoords='data', textcoords='data',
             va='center', ha='left',
             arrowprops=dict(arrowstyle="simple", connectionstyle="arc3,rad=0.1"))

plt.annotate("training", xy=(100, 0.04), xytext=(110, 0.05),
             xycoords='data', textcoords='data',
             va='center', ha='left',
             arrowprops=dict(arrowstyle="simple", connectionstyle="arc3,rad=0.1"))

plt.annotate("public speaking", xy=(120, 0.02), xytext=(125, 0.03),
             xycoords='data', textcoords='data',
             va='center', ha='left',
             arrowprops=dict(arrowstyle="simple", connectionstyle="arc3,rad=0.1"))

# Show the plot
plt.show()
In [7]:
groups = raw.groupby('Activities Detailed')

# Create an empty dictionary to store the smaller dataframes
smaller_dfs = {}

# Iterate over each group and create separate dataframes
for label, group_df in groups:
    smaller_dfs[label] = group_df.copy()
    
sns.set_palette("Set1", desat=1)

# Iterate over each label and corresponding dataframe
for label, df in smaller_dfs.items():
    # Create a new plot for each dataframe
    plt.figure(figsize=(10, 6))

    # Create a FacetGrid for the current dataframe
    facetgrid = sns.FacetGrid(df, hue='Activities Detailed', height=3, aspect=2)

    # Map the distribution plot
    facetgrid.map(sns.kdeplot, 'HR', fill=True)

    # Add legend
    facetgrid.add_legend()
    
    mean_val = df['HR'].mean()
    min_val = df['HR'].min()
    max_val = df['HR'].max()

    # Add text annotations for mean, min, and max values
    plt.text(0.6, 0.8, f"Mean: {mean_val:.2f}", transform=plt.gca().transAxes)
    plt.text(0.6, 0.7, f"Min: {min_val:.2f}", transform=plt.gca().transAxes)
    plt.text(0.6, 0.6, f"Max: {max_val:.2f}", transform=plt.gca().transAxes)


    # Set title
    plt.title(label)

    # Show the plot
    plt.show();
<Figure size 1000x600 with 0 Axes>
<Figure size 1000x600 with 0 Axes>
<Figure size 1000x600 with 0 Axes>
<Figure size 1000x600 with 0 Axes>
<Figure size 1000x600 with 0 Axes>
<Figure size 1000x600 with 0 Axes>
<Figure size 1000x600 with 0 Axes>
<Figure size 1000x600 with 0 Axes>
<Figure size 1000x600 with 0 Axes>
<Figure size 1000x600 with 0 Axes>
<Figure size 1000x600 with 0 Axes>
<Figure size 1000x600 with 0 Axes>
<Figure size 1000x600 with 0 Axes>
<Figure size 1000x600 with 0 Axes>
<Figure size 1000x600 with 0 Axes>
<Figure size 1000x600 with 0 Axes>
<Figure size 1000x600 with 0 Axes>
<Figure size 1000x600 with 0 Axes>
<Figure size 1000x600 with 0 Axes>
<Figure size 1000x600 with 0 Axes>
In [8]:
correlation_matrix = raw.drop(columns = ['Controled stress', ' stress',
       'Before Controled stress', 'After controlled stress','Jumps','JumpFlightTime']).select_dtypes(include=['float64', 'int64']).corr().round(2)

# Create the heatmap
plt.figure(figsize=(18, 8))  # Adjust the figure size as needed
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')

# Set the plot title
plt.title('Pairwise Correlation Heatmap')

# Show the plot
plt.show()
In [ ]:
 
In [ ]:
 

Data Split¶

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42, stratify = y)
In [ ]:
 

Initial Data Cleaning¶

In [10]:
class DataCleanerTransformer:
    def __init__(self):
        # Define the columns that need to be converted to float
        self.float_columns = ['Activity', 'PeakAcceleration', 'ECGAmplitude', 'ECGNoise', 'CoreTemp',
                              'AvStepPeriod', 'AvForceDevRate', 'AvStepImpulse']
        
    def fit(self, data):
        # Nothing to fit in this case, so we return self.
        return self

    def transform(self, data):
        # Convert selected columns to float
        data[self.float_columns] = data[self.float_columns].apply(lambda x: x.str.replace(',', '.').astype(float))

        # Convert 'Time' column to pandas datetime format
        data['Time_New'] = pd.to_datetime(data['Time'], format="%d.%m.%Y %H:%M:%S")

        # Fill NaN values with 0
        data = data.fillna(0)
        
        return data

    def fit_transform(self, data):
        # Fit the transformer (in this case, there's nothing to fit)
        self.fit(data)

        # Transform the data using the transform method
        return self.transform(data)


# Example usage:
# Instantiate the DataCleanerTransformer class
data_cleaner = DataCleanerTransformer()

X_train_loaded = data_cleaner.fit_transform(X_train)

Data Reduction¶

In [11]:
drop = [
        #The model overfits and remmember when what activities were done so they are removed
        'Year', 'Month', 'Weekday', 'Time', 'Time_New','Date','Hour',
    
        #We have few participants so the model is good at overfitting who did what
        'Name of the volunteer',
        
        #These are health KPIs of how the Holter machine works and aren't dependant on activity.
        'BRAmplitude','ECGAmplitude', 'HRConfidence',
        
        #These measure how the machine works and whether it detects noise. This is what we model.
        'ECGNoise','Jumps','JumpFlightTime'
    
        #Stress is mannualy inputted
#         ,'Controled stress',' stress', 'Before Controled stress', 'After controlled stress'
       ]
In [12]:
X_preprocessed = X_train_loaded.drop(columns = drop)

Data Exploration¶

In [13]:
ratios = (X_preprocessed.nunique() / X_preprocessed.count()).sort_values(ascending=True)

# Create a bar plot to visualize the ratios
plt.figure(figsize=(10, 6))
plt.barh(ratios.index, ratios.values)
plt.xlabel('Ratio of Unique Values to Total Count')
plt.ylabel('Columns')
plt.title('Ratio of Unique Values to Total Count for each Column')
plt.tight_layout()
plt.show()
In [14]:
import matplotlib.pyplot as plt

# Assuming you have already calculated the ratio of variance to mean
ratios = (X_preprocessed.var() / X_preprocessed.mean()).sort_values(ascending=True)

# Create a bar plot to visualize the ratios
plt.figure(figsize=(10, 6))

# Color negative values in red, and non-negative values in the default color (blue)
colors = ['red' if value < 0 else 'blue' for value in ratios.values]

plt.barh(ratios.index, ratios.values, color=colors)
plt.xlabel('Ratio of Variance to Mean')
plt.ylabel('Columns')
plt.title('Ratio of Variance to Mean for each Column')
plt.tight_layout()
plt.show()
In [15]:
import matplotlib.pyplot as plt

# Assuming you have already calculated the ratio of unique values to total count and variance to mean
unique_ratios = (X_preprocessed.nunique() / X_preprocessed.count()).sort_values(ascending=True)
variance_ratios = (X_preprocessed.var() / X_preprocessed.mean()).sort_values(ascending=True)

# Create a scatter plot to combine the two ratios
plt.figure(figsize=(10, 6))


# Plot the scatter plot with unique-to-total count ratios on the x-axis and variance-to-mean ratios on the y-axis
plt.scatter(unique_ratios.values, variance_ratios.values, color=colors)

# Annotate each point with the column name next to the data point
for i, col in enumerate(unique_ratios.index):
    plt.annotate(col, xy=(unique_ratios.values[i], variance_ratios.values[i]),
                 xytext=(5, 0), textcoords='offset points')

# Set log scales for both x and y axes
plt.xscale('log')
plt.yscale('log')

plt.xlabel('Unique-to-Total Count Ratio')
plt.ylabel('Variance-to-Mean Ratio')
plt.title('Unique-to-Total Count vs. Variance-to-Mean Ratio for each Column (Log Scales)')
plt.tight_layout()

plt.show()

Non Zero transformer¶

In [16]:
class AnyGreaterThanZeroTransformer:
    def __init__(self, columns, c_name):
        
        self.columns = columns
        self.c_name = c_name
        
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # Concatenate the names of the specified columns into a new column
        X[f'{self.c_name}'] =   (X[self.columns] > 0).any(axis=1).astype(int)

        # Drop the original specified columns as they are no longer needed
        X.drop(columns=self.columns, inplace=True)

        return X

    def fit_transform(self, X, y=None):
        return self.fit(X).transform(X)
In [17]:
X_non_binary = X_preprocessed.copy()

Stress¶

In [18]:
# Plot histograms for the specified columns
plt.figure(figsize=(10, 6))
X_preprocessed.hist(column=['Controled stress', ' stress', 'Before Controled stress', 'After controlled stress'], bins=10)
plt.xlabel('Values')
plt.ylabel('Frequency')
# plt.title('Histograms for the Specified Columns')
plt.tight_layout()
plt.show()

# Plot boxplots for the specified columns
plt.figure(figsize=(10, 6))
sns.boxplot(data=X_preprocessed[['Controled stress', ' stress', 'Before Controled stress', 'After controlled stress']])
plt.xlabel('Columns')
plt.ylabel('Values')
# plt.title('Boxplots for the Specified Columns')
plt.tight_layout()
plt.show()
<Figure size 1000x600 with 0 Axes>
In [19]:
# Use the fit_transform method to create the new column 'AnyGreaterThanZero'
Stress_Data = X_non_binary.copy()
Stress_Data = AnyGreaterThanZeroTransformer(columns=['Controled stress',' stress', 'Before Controled stress', 'After controlled stress'],
                                            c_name = 'Stress_Binary')\
                                            .fit_transform(X_non_binary)

Impact¶

In [20]:
# Plot histograms for the specified columns
plt.figure(figsize=(10, 6))
X_preprocessed.hist(column=['MajorImpacts','MinorImpacts'], bins=10)
plt.xlabel('Values')
plt.ylabel('Frequency')
# plt.title('Histograms for the Specified Columns')
plt.tight_layout()
plt.show()

# Plot boxplots for the specified columns
plt.figure(figsize=(10, 6))
sns.boxplot(data=X_preprocessed[['MajorImpacts','MinorImpacts']])
plt.xlabel('Columns')
plt.ylabel('Values')
# plt.title('Boxplots for the Specified Columns')
plt.tight_layout()
plt.show()
<Figure size 1000x600 with 0 Axes>
In [21]:
Impact_Data = AnyGreaterThanZeroTransformer(columns=['MajorImpacts','MinorImpacts']
                                           ,c_name = 'Impact_Binary')\
                                            .fit_transform(Stress_Data)

Av* Data¶

In [22]:
# Plot histograms for the specified columns
plt.figure(figsize=(10, 6))
X_preprocessed.hist(column=['AvForceDevRate', 'AvStepImpulse', 'AvStepPeriod'], bins=10)
plt.xlabel('Values')
plt.ylabel('Frequency')
# plt.title('Histograms for the Specified Columns')
plt.tight_layout()
plt.show()

# Plot boxplots for the specified columns
plt.figure(figsize=(10, 6))
sns.boxplot(data=X_preprocessed[['AvForceDevRate', 'AvStepImpulse', 'AvStepPeriod']])
plt.xlabel('Columns')
plt.ylabel('Values')
# plt.title('Boxplots for the Specified Columns')
plt.tight_layout()
plt.show()
<Figure size 1000x600 with 0 Axes>
In [23]:
Av_Data = AnyGreaterThanZeroTransformer(columns=['AvForceDevRate', 'AvStepImpulse', 'AvStepPeriod']
                                           ,c_name = 'Av_Binary')\
                                            .fit_transform(Stress_Data)

Peak Data¶

In [24]:
# Plot histograms for the specified columns
plt.figure(figsize=(10, 6))
X_preprocessed.hist(column=['PeakAcceleration', 'PeakAccelPhi', 'peakAccelTheta'], bins=10)
plt.xlabel('Values')
plt.ylabel('Frequency')
# plt.title('Histograms for the Specified Columns')
plt.tight_layout()
plt.show()

# Plot boxplots for the specified columns
plt.figure(figsize=(10, 6))
sns.boxplot(data=X_preprocessed[['PeakAcceleration', 'PeakAccelPhi', 'peakAccelTheta']])
plt.xlabel('Columns')
plt.ylabel('Values')
# plt.title('Boxplots for the Specified Columns')
plt.tight_layout()
plt.show()
<Figure size 1000x600 with 0 Axes>
In [25]:
Accel_Data = AnyGreaterThanZeroTransformer(columns=['PeakAcceleration']
                                           ,c_name = 'Acc_Binary')\
                                            .fit_transform(Stress_Data)

Bounds¶

In [26]:
# Plot histograms for the specified columns
plt.figure(figsize=(10, 6))
X_preprocessed.hist(column=['Bounds', 'RunSteps'], bins=10)
plt.xlabel('Values')
plt.ylabel('Frequency')
# plt.title('Histograms for the Specified Columns')
plt.tight_layout()
plt.show()

# Plot boxplots for the specified columns
plt.figure(figsize=(10, 6))
sns.boxplot(data=X_preprocessed[['Bounds', 'RunSteps']])
plt.xlabel('Columns')
plt.ylabel('Values')
# plt.title('Boxplots for the Specified Columns')
plt.tight_layout()
plt.show()
<Figure size 1000x600 with 0 Axes>
In [27]:
Bounds_Data = AnyGreaterThanZeroTransformer(columns=['Bounds']
                                           ,c_name = 'Bounds_Binary')\
                                            .fit_transform(Stress_Data)

Data Cleaning¶

In [28]:
X_train_outlier = Bounds_Data.copy()
In [29]:
# Identify non-binary columns
non_binary_columns = ['HR','HRV','BR','Posture', 'Activity']
binary_clumns = X_train_outlier.drop(columns = non_binary_columns).columns.tolist()
In [30]:
# Plot boxplots for the specified columns
plt.figure(figsize=(10, 6))
sns.boxplot(data=X_train_outlier[non_binary_columns])
plt.xlabel('Columns')
plt.ylabel('Values')
# plt.title('Boxplots for the Specified Columns')
plt.tight_layout()
plt.show()

Outlier Replacer¶

In [31]:
class OutlierReplacer:
    def __init__(self, columns):
        self.columns = columns
        self.iqr_ranges = {}

    def fit(self, X, y=None):
        for column in self.columns:
            Q1 = X[column].quantile(0.25)
            Q3 = X[column].quantile(0.75)
            IQR = Q3 - Q1
            self.iqr_ranges[column] = (Q1 - 1.5 * IQR, Q3 + 1.5 * IQR)

        return self

    def transform(self, X):
        X_transformed = X.copy()

        for column in self.columns:
            if column in self.iqr_ranges:
                outliers = (X_transformed[column] < self.iqr_ranges[column][0]) | (X_transformed[column] > self.iqr_ranges[column][1])
                detected_outliers = X_transformed[outliers]

                clean_data_iqr = X_transformed[~outliers].fillna(0)

                mean_value = clean_data_iqr[column].mean()
                noise = np.random.normal(0, 0.1, len(detected_outliers))
                mean_value_with_noise = noise + mean_value

                X_transformed.loc[outliers, column] = mean_value_with_noise

        return X_transformed

    def fit_transform(self, X, y=None):
        return self.fit(X).transform(X)
In [32]:
# Instantiate the OutlierReplacer class and fit_transform on training data
outlier_replacer = OutlierReplacer(columns=non_binary_columns).fit(X_train_outlier)

X_train_less_outliers = outlier_replacer.transform(X_train_outlier)

Scaler¶

In [33]:
scaler = StandardScaler().fit(X_train_less_outliers)
In [34]:
# Make a copy of the original DataFrame
X_train_scaled = X_train_less_outliers.copy()

# Instantiate the StandardScaler with the selected columns and fit and transform them
scaler = StandardScaler().fit(X_train_less_outliers[non_binary_columns])

X_train_scaled[non_binary_columns] = scaler.transform(X_train_less_outliers[non_binary_columns])

Label Encoder¶

In [35]:
label_encoder = LabelEncoder().fit(y_train)
In [36]:
y_train['Target'] = label_encoder.transform(y_train)
In [37]:
y_train_target = y_train[['Target']]

Model¶

In [38]:
clf1 = RandomForestClassifier(
    n_estimators=271,
    max_depth=36,
    min_samples_split=19,
    min_samples_leaf=11,
    random_state=42
)



clf2 = XGBClassifier(
    learning_rate=0.06048731265187917,
    n_estimators=195,
    max_depth=9,
    subsample= 0.7790510010086706,
    colsample_bytree=0.6220627611238871,
    random_state=42
)

clf3 = AdaBoostClassifier(
    n_estimators=170,
    learning_rate=0.2379351625419417,
    random_state=42
)

clf4 = GradientBoostingClassifier(
    n_estimators=186,
    learning_rate=0.01208563915935721,
    max_depth=9,
    subsample=0.7424149856794916,
    random_state=42
)


clf5 = DecisionTreeClassifier(
    max_depth=16,
    min_samples_split=9,
    min_samples_leaf=4,
    random_state=42
)


# Define the ensemble classifier with the scaler
models = VotingClassifier(estimators=[
    ('rfc', clf1), 
    ('xgb', clf2), 
    ('ada', clf3), 
    ('gbc', clf4), 
    ('dtc', clf5)],
    voting='soft')

# Fit the pipeline to the training data
models.fit(X_train_scaled[['HR','HRV','BR','Posture', 'Activity', 'Impact_Binary','Bounds_Binary']], y_train_target)
Out[38]:
VotingClassifier(estimators=[('rfc',
                              RandomForestClassifier(max_depth=36,
                                                     min_samples_leaf=11,
                                                     min_samples_split=19,
                                                     n_estimators=271,
                                                     random_state=42)),
                             ('xgb',
                              XGBClassifier(base_score=None, booster=None,
                                            callbacks=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=0.6220627611238871,
                                            early_stopping_rounds=None,
                                            enable_categorical=False,
                                            eval...
                             ('ada',
                              AdaBoostClassifier(learning_rate=0.2379351625419417,
                                                 n_estimators=170,
                                                 random_state=42)),
                             ('gbc',
                              GradientBoostingClassifier(learning_rate=0.01208563915935721,
                                                         max_depth=9,
                                                         n_estimators=186,
                                                         random_state=42,
                                                         subsample=0.7424149856794916)),
                             ('dtc',
                              DecisionTreeClassifier(max_depth=16,
                                                     min_samples_leaf=4,
                                                     min_samples_split=9,
                                                     random_state=42))],
                 voting='soft')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
VotingClassifier(estimators=[('rfc',
                              RandomForestClassifier(max_depth=36,
                                                     min_samples_leaf=11,
                                                     min_samples_split=19,
                                                     n_estimators=271,
                                                     random_state=42)),
                             ('xgb',
                              XGBClassifier(base_score=None, booster=None,
                                            callbacks=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=0.6220627611238871,
                                            early_stopping_rounds=None,
                                            enable_categorical=False,
                                            eval...
                             ('ada',
                              AdaBoostClassifier(learning_rate=0.2379351625419417,
                                                 n_estimators=170,
                                                 random_state=42)),
                             ('gbc',
                              GradientBoostingClassifier(learning_rate=0.01208563915935721,
                                                         max_depth=9,
                                                         n_estimators=186,
                                                         random_state=42,
                                                         subsample=0.7424149856794916)),
                             ('dtc',
                              DecisionTreeClassifier(max_depth=16,
                                                     min_samples_leaf=4,
                                                     min_samples_split=9,
                                                     random_state=42))],
                 voting='soft')
RandomForestClassifier(max_depth=36, min_samples_leaf=11, min_samples_split=19,
                       n_estimators=271, random_state=42)
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.6220627611238871, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.06048731265187917,
              max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=9, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=195, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=42, ...)
AdaBoostClassifier(learning_rate=0.2379351625419417, n_estimators=170,
                   random_state=42)
GradientBoostingClassifier(learning_rate=0.01208563915935721, max_depth=9,
                           n_estimators=186, random_state=42,
                           subsample=0.7424149856794916)
DecisionTreeClassifier(max_depth=16, min_samples_leaf=4, min_samples_split=9,
                       random_state=42)
In [39]:
X
Out[39]:
Time Year Month Weekday Hour HR BR Posture Activity PeakAcceleration ... AvStepImpulse AvStepPeriod JumpFlightTime PeakAccelPhi peakAccelTheta Controled stress stress Before Controled stress After controlled stress Name of the volunteer
0 26.3.2020 9:02:28 2020 March Thursday 9 79 15 16 0,01 0,02 ... 0 0 0 163 -50 1.0 NaN NaN NaN EM
1 26.3.2020 9:02:29 2020 March Thursday 9 79 15 16 0,01 0,1 ... 0 0 0 168 -87 1.0 NaN NaN NaN EM
2 26.3.2020 9:02:30 2020 March Thursday 9 79 15 17 0,01 0,03 ... 0 0 0 164 -47 1.0 NaN NaN NaN EM
3 26.3.2020 9:02:31 2020 March Thursday 9 79 15 17 0,01 0,02 ... 0 0 0 163 -43 1.0 NaN NaN NaN EM
4 26.3.2020 9:02:32 2020 March Thursday 9 79 15 17 0 0,02 ... 0 0 0 163 -50 1.0 NaN NaN NaN EM
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
271793 22.6.2022 6:35:10 2022 June Wednesday 6 84 20 -34 0,04 0,08 ... 2,75 0,351 0 140 91 NaN 1.0 NaN NaN EM
271794 22.6.2022 6:35:11 2022 June Wednesday 6 84 20 -35 0,01 0,03 ... 0 0 0 142 91 NaN 1.0 NaN NaN EM
271795 22.6.2022 6:35:12 2022 June Wednesday 6 84 19 -36 0,04 0,19 ... 0 0 0 133 95 NaN 1.0 NaN NaN EM
271796 22.6.2022 6:35:13 2022 June Wednesday 6 85 19 -38 0,05 0,09 ... 0 0 0 135 95 NaN 1.0 NaN NaN EM
271797 22.6.2022 6:35:14 2022 June Wednesday 6 85 19 -40 0,06 0,1 ... 0 0 0 134 99 NaN 1.0 NaN NaN EM

271797 rows × 35 columns

In [40]:
from sklearn.model_selection import RandomizedSearchCV
from scipy import stats
import dask.diagnostics

# Define the parameter distributions for each classifier
param_distributions = {
    'rfc__n_estimators': stats.randint(100, 500),
    'rfc__max_depth': [None] + list(np.arange(5, 51)),
    'rfc__min_samples_split': stats.randint(2, 20),
    'rfc__min_samples_leaf': stats.randint(1, 20),
    
    'xgb__learning_rate': stats.uniform(0.01, 0.3),
    'xgb__n_estimators': stats.randint(100, 500),
    'xgb__max_depth': stats.randint(2, 10),
    'xgb__subsample': stats.uniform(0.5, 0.5),
    'xgb__colsample_bytree': stats.uniform(0.5, 0.5),
    
    'ada__n_estimators': stats.randint(50, 200),
    'ada__learning_rate': stats.uniform(0.01, 1.0),
    
    'gbc__n_estimators': stats.randint(50, 200),
    'gbc__learning_rate': stats.uniform(0.01, 0.3),
    'gbc__max_depth': stats.randint(2, 10),
    'gbc__subsample': stats.uniform(0.5, 0.5),

    'dtc__max_depth': [None] + list(np.arange(5, 51)),
    'dtc__min_samples_split': stats.randint(2, 20),
    'dtc__min_samples_leaf': stats.randint(1, 20)
}
In [41]:
# # Perform the random search
# search = RandomizedSearchCV(models, param_distributions, n_iter=10, cv=3, verbose=2,  random_state=42)

# with dask.diagnostics.ProgressBar():
#     search.fit(X_train_scaled.sample(n= 10000, random_state= 42)[['HR','HRV','BR','Posture', 'Activity', 'Impact_Binary','Bounds_Binary']], y_train_target.sample(n= 10000, random_state= 42))
In [42]:
best_params_  = {'ada__learning_rate': 0.2379351625419417, 'ada__n_estimators': 170, 'dtc__max_depth': 16, 'dtc__min_samples_leaf': 9, 'dtc__min_samples_split': 4, 'gbc__learning_rate': 0.01208563915935721, 'gbc__max_depth': 9, 'gbc__n_estimators': 186, 'gbc__subsample': 0.7424149856794916, 'rfc__max_depth': 36, 'rfc__min_samples_leaf': 19, 'rfc__min_samples_split': 11, 'rfc__n_estimators': 271, 'xgb__colsample_bytree': 0.6220627611238871, 'xgb__learning_rate': 0.06048731265187917, 'xgb__max_depth': 9, 'xgb__n_estimators': 195, 'xgb__subsample': 0.7790510010086706}

# Check the best parameters
print(best_params_)

#{'ada__learning_rate': 0.2379351625419417, 'ada__n_estimators': 170, 'dtc__max_depth': 16, 'dtc__min_samples_leaf': 9, 'dtc__min_samples_split': 4, 'gbc__learning_rate': 0.01208563915935721, 'gbc__max_depth': 9, 'gbc__n_estimators': 186, 'gbc__subsample': 0.7424149856794916, 'rfc__max_depth': 36, 'rfc__min_samples_leaf': 19, 'rfc__min_samples_split': 11, 'rfc__n_estimators': 271, 'xgb__colsample_bytree': 0.6220627611238871, 'xgb__learning_rate': 0.06048731265187917, 'xgb__max_depth': 9, 'xgb__n_estimators': 195, 'xgb__subsample': 0.7790510010086706}
{'ada__learning_rate': 0.2379351625419417, 'ada__n_estimators': 170, 'dtc__max_depth': 16, 'dtc__min_samples_leaf': 9, 'dtc__min_samples_split': 4, 'gbc__learning_rate': 0.01208563915935721, 'gbc__max_depth': 9, 'gbc__n_estimators': 186, 'gbc__subsample': 0.7424149856794916, 'rfc__max_depth': 36, 'rfc__min_samples_leaf': 19, 'rfc__min_samples_split': 11, 'rfc__n_estimators': 271, 'xgb__colsample_bytree': 0.6220627611238871, 'xgb__learning_rate': 0.06048731265187917, 'xgb__max_depth': 9, 'xgb__n_estimators': 195, 'xgb__subsample': 0.7790510010086706}
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [43]:
y_pred_train = models.predict(X_train_scaled[['HR','HRV','BR','Posture', 'Activity', 'Impact_Binary','Bounds_Binary']])
In [44]:
# Generate a classification report
class_report = classification_report(y_pred_train, y_train_target)
In [45]:
print("Classification Report:")
print(class_report)
Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.98        57
           1       0.97      0.94      0.95       448
           2       0.97      0.99      0.98      2004
           3       0.65      0.97      0.78      1680
           4       0.95      0.87      0.91      3627
           5       0.99      1.00      1.00     44166
           6       0.98      1.00      0.99      1252
           7       0.86      0.86      0.86      1755
           8       0.76      0.88      0.82      4412
           9       0.96      0.90      0.93     51942
          10       0.99      0.99      0.99      2782
          11       0.99      0.98      0.99      2218
          12       0.93      0.84      0.88      2461
          13       1.00      0.98      0.99      7251
          14       0.95      0.97      0.96      2163
          15       0.80      0.86      0.83      5265
          16       0.93      0.96      0.95     45417
          17       0.88      0.95      0.91      2692
          18       0.72      0.97      0.82       140
          19       0.97      0.98      0.97       371

    accuracy                           0.94    182103
   macro avg       0.91      0.94      0.92    182103
weighted avg       0.95      0.94      0.95    182103

Save model¶

In [46]:
# Save the pipeline to a file
filename = 'ensamble_fitted_model_summerschool2023_stress_HR_BR_HRV.joblib'
joblib.dump(models, filename)
Out[46]:
['ensamble_fitted_model_summerschool2023_stress_HR_BR_HRV.joblib']

Feature Importance¶

In [47]:
# Get feature importances for each individual model
feature_importances = pd.DataFrame()

for name, clf in models.named_estimators_.items():
    if hasattr(clf, 'feature_importances_'):
        feature_importances[name] = clf.feature_importances_

feature_importances.index = X_train_scaled[['HR','HRV','BR','Posture', 'Activity', 'Impact_Binary','Bounds_Binary']].columns
In [48]:
feature_importances
Out[48]:
rfc xgb ada gbc dtc
HR 0.211334 0.063243 0.052941 0.263226 0.180447
HRV 0.193873 0.048195 0.123529 0.205495 0.228883
BR 0.086372 0.026917 0.182353 0.071380 0.076369
Posture 0.240190 0.072173 0.288235 0.239884 0.218524
Activity 0.027014 0.009675 0.023529 0.015290 0.009971
Impact_Binary 0.161471 0.522415 0.205882 0.134348 0.193274
Bounds_Binary 0.079747 0.257381 0.123529 0.070377 0.092531
In [49]:
feature_importances.plot(kind='bar')
Out[49]:
<Axes: >
In [50]:
# Calculate the correlation matrix
corr_matrix = feature_importances.corr()

# Create a heatmap of the correlation matrix
plt.figure(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.title('Correlation Matrix')
plt.show()

X_test & Y_test prep¶

In [51]:
X_test_initial = X_test.copy()
X_test_loaded = data_cleaner.transform(X_test_initial)   #data cleaning
X_test_preprocessed = X_test_loaded.drop(columns = drop) #columns drop
X_test_non_binary = X_test_preprocessed.copy()           #copy df
In [52]:
X_test_Stress_Data = X_test_non_binary.copy()
X_test_Stress_Data = AnyGreaterThanZeroTransformer(columns=['Controled stress',' stress', 'Before Controled stress', 'After controlled stress'],
                                            c_name = 'Stress_Binary')\
                                            .transform(X_test_non_binary)  #binary stress



X_test_Impact_Data = AnyGreaterThanZeroTransformer(columns=['MajorImpacts','MinorImpacts']
                                           ,c_name = 'Impact_Binary')\
                                            .transform(X_test_Stress_Data) #binary impacts

X_test_Av_Data = AnyGreaterThanZeroTransformer(columns=['AvForceDevRate', 'AvStepImpulse', 'AvStepPeriod']
                                           ,c_name = 'Av_Binary')\
                                            .transform(X_test_Impact_Data) #binary av columns 

X_test_Accel_Data = AnyGreaterThanZeroTransformer(columns=['PeakAcceleration']
                                           ,c_name = 'Acc_Binary')\
                                            .transform(X_test_Impact_Data) #binary accel

X_test_Bounds_Data = AnyGreaterThanZeroTransformer(columns=['Bounds']
                                           ,c_name = 'Bounds_Binary')\
                                            .transform(X_test_Accel_Data)  #binary bounds
In [53]:
X_test_outlier = X_test_Bounds_Data.copy()                                              #copying the data before outlier removal
X_test_less_outliers = outlier_replacer.transform(X_test_outlier)                       #outliers removal
X_test_scaled = X_test_less_outliers.copy()                                             #outliers df copy
X_test_scaled[non_binary_columns] = scaler.transform(X_test_scaled[non_binary_columns]) # replace the og df
In [54]:
y_test['Target'] = label_encoder.transform(y_test[['Activities Detailed']]) #transforming labels
y_test_target = y_test[['Target']]                 #predictor class

Model Loading¶

In [55]:
loaded_model = joblib.load('ensamble_fitted_model_summerschool2023_stress_HR_BR_HRV.joblib') 
In [56]:
loaded_model
Out[56]:
VotingClassifier(estimators=[('rfc',
                              RandomForestClassifier(max_depth=36,
                                                     min_samples_leaf=11,
                                                     min_samples_split=19,
                                                     n_estimators=271,
                                                     random_state=42)),
                             ('xgb',
                              XGBClassifier(base_score=None, booster=None,
                                            callbacks=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=0.6220627611238871,
                                            early_stopping_rounds=None,
                                            enable_categorical=False,
                                            eval...
                             ('ada',
                              AdaBoostClassifier(learning_rate=0.2379351625419417,
                                                 n_estimators=170,
                                                 random_state=42)),
                             ('gbc',
                              GradientBoostingClassifier(learning_rate=0.01208563915935721,
                                                         max_depth=9,
                                                         n_estimators=186,
                                                         random_state=42,
                                                         subsample=0.7424149856794916)),
                             ('dtc',
                              DecisionTreeClassifier(max_depth=16,
                                                     min_samples_leaf=4,
                                                     min_samples_split=9,
                                                     random_state=42))],
                 voting='soft')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
VotingClassifier(estimators=[('rfc',
                              RandomForestClassifier(max_depth=36,
                                                     min_samples_leaf=11,
                                                     min_samples_split=19,
                                                     n_estimators=271,
                                                     random_state=42)),
                             ('xgb',
                              XGBClassifier(base_score=None, booster=None,
                                            callbacks=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=0.6220627611238871,
                                            early_stopping_rounds=None,
                                            enable_categorical=False,
                                            eval...
                             ('ada',
                              AdaBoostClassifier(learning_rate=0.2379351625419417,
                                                 n_estimators=170,
                                                 random_state=42)),
                             ('gbc',
                              GradientBoostingClassifier(learning_rate=0.01208563915935721,
                                                         max_depth=9,
                                                         n_estimators=186,
                                                         random_state=42,
                                                         subsample=0.7424149856794916)),
                             ('dtc',
                              DecisionTreeClassifier(max_depth=16,
                                                     min_samples_leaf=4,
                                                     min_samples_split=9,
                                                     random_state=42))],
                 voting='soft')
RandomForestClassifier(max_depth=36, min_samples_leaf=11, min_samples_split=19,
                       n_estimators=271, random_state=42)
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.6220627611238871, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.06048731265187917,
              max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=9, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=195, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=42, ...)
AdaBoostClassifier(learning_rate=0.2379351625419417, n_estimators=170,
                   random_state=42)
GradientBoostingClassifier(learning_rate=0.01208563915935721, max_depth=9,
                           n_estimators=186, random_state=42,
                           subsample=0.7424149856794916)
DecisionTreeClassifier(max_depth=16, min_samples_leaf=4, min_samples_split=9,
                       random_state=42)
In [57]:
y_test_pred = loaded_model.predict(X_test_scaled[['HR','HRV','BR','Posture', 'Activity', 'Impact_Binary','Bounds_Binary']])
In [58]:
# Generate a classification report
test_class_report = classification_report(y_test_pred,y_test_target )
In [59]:
print("Classification Report:")
print(test_class_report)
Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.96      0.95        28
           1       0.92      0.92      0.92       213
           2       0.94      0.98      0.96       969
           3       0.58      0.96      0.72       745
           4       0.93      0.85      0.89      1781
           5       0.99      0.99      0.99     21762
           6       0.97      0.99      0.98       617
           7       0.80      0.83      0.82       829
           8       0.72      0.84      0.78      2161
           9       0.94      0.87      0.91     25781
          10       0.98      0.99      0.98      1363
          11       0.96      0.95      0.96      1095
          12       0.85      0.81      0.83      1144
          13       0.99      0.97      0.98      3601
          14       0.93      0.96      0.95      1058
          15       0.75      0.81      0.78      2586
          16       0.92      0.95      0.93     22428
          17       0.80      0.89      0.85      1299
          18       0.53      0.71      0.61        70
          19       0.88      0.98      0.93       164

    accuracy                           0.93     89694
   macro avg       0.87      0.91      0.88     89694
weighted avg       0.93      0.93      0.93     89694

In [60]:
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.colors as colors

# Define a colormap where zero values are white
cmap = colors.ListedColormap(['white', 'lightpink', 'lightblue'])

# Define the bounds for each color
bounds = [0,1,2,3]

# Create a normalization based on the bounds
norm = colors.BoundaryNorm(bounds, cmap.N)

# Compute the confusion matrix
cm = confusion_matrix(y_test_target, y_test_pred)

# Get sorted labels
labels = sorted(y_test['Activities Detailed'].unique())

# Create the heatmap
plt.figure(figsize=(10,7))
sns.heatmap(cm, annot=False, cmap=cmap, norm=norm, xticklabels=labels, yticklabels=labels)

plt.title('Confusion Matrix based on Frequencies', size=16)
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')

plt.show()
In [61]:
# Generate a classification report
class_report = classification_report(y_pred_train, y_train_target, output_dict=True)

test_class_report = classification_report(y_test_pred,y_test_target, output_dict=True) 

# Create dataframes
df_test = pd.DataFrame(test_class_report).transpose()
df = pd.DataFrame(class_report).transpose()

# Add suffixes to the column names
df_test.columns = [str(col) + '_Test' for col in df_test.columns]
df.columns = [str(col) for col in df.columns]

# Concatenate the dataframes
result_df = pd.concat([df, df_test], axis=1).reset_index(drop=False).iloc[:-3,:]
macro_df = pd.concat([df, df_test], axis=1).reset_index(drop=False).iloc[-3:,:]
In [62]:
result_df['index'] = result_df['index'].astype(int)
In [63]:
result_df['Label'] = label_encoder.inverse_transform(result_df['index'])
In [64]:
result_df
Out[64]:
index precision recall f1-score support precision_Test recall_Test f1-score_Test support_Test Label
0 0 0.966102 1.000000 0.982759 57.0 0.931034 0.964286 0.947368 28.0 Coherent Breathing
1 1 0.974419 0.935268 0.954442 448.0 0.924528 0.920188 0.922353 213.0 Play Quitar
2 2 0.966423 0.991018 0.978566 2004.0 0.940711 0.982456 0.961131 969.0 cognitive workout
3 3 0.647551 0.967857 0.775948 1680.0 0.576395 0.957047 0.719475 745.0 competition
4 4 0.952439 0.866832 0.907621 3627.0 0.927429 0.846715 0.885236 1781.0 creative writing
5 5 0.991900 0.998121 0.995001 44166.0 0.988853 0.994670 0.991753 21762.0 dream
6 6 0.984252 0.998403 0.991277 1252.0 0.974400 0.987034 0.980676 617.0 leisure
7 7 0.861206 0.862678 0.861941 1755.0 0.799076 0.834741 0.816519 829.0 meeting with client
8 8 0.764510 0.883726 0.819807 4412.0 0.723726 0.841277 0.778087 2161.0 negotiation
9 9 0.957183 0.895210 0.925160 51942.0 0.940739 0.873124 0.905671 25781.0 office work
10 10 0.990298 0.990654 0.990476 2782.0 0.980306 0.986060 0.983175 1363.0 phone calls
11 11 0.992230 0.978810 0.985474 2218.0 0.963788 0.947945 0.955801 1095.0 play piano
12 12 0.925460 0.837464 0.879266 2461.0 0.847767 0.812937 0.829987 1144.0 presentation
13 13 0.995385 0.981658 0.988474 7251.0 0.987791 0.966121 0.976836 3601.0 public speaking
14 14 0.949458 0.972723 0.960950 2163.0 0.931256 0.960302 0.945556 1058.0 reading book
15 15 0.804282 0.863248 0.832723 5265.0 0.748473 0.805491 0.775936 2586.0 sales
16 16 0.932961 0.963384 0.947928 45417.0 0.918788 0.946317 0.932349 22428.0 sport
17 17 0.875300 0.949108 0.910711 2692.0 0.804590 0.890685 0.845451 1299.0 training
18 18 0.715789 0.971429 0.824242 140.0 0.531915 0.714286 0.609756 70.0 walking meeting
19 19 0.967914 0.975741 0.971812 371.0 0.875000 0.981707 0.925287 164.0 writing study
In [65]:
result_df.iloc[:-3,:][['precision','precision_Test']].plot(kind='bar')
Out[65]:
<Axes: >
In [ ]: