import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import (AdaBoostClassifier, GradientBoostingClassifier,
                              RandomForestClassifier, VotingClassifier)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, f1_score, precision_score,
                             recall_score)
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (FunctionTransformer, LabelEncoder,
                                   OneHotEncoder, StandardScaler)
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

warnings.filterwarnings('ignore')
# Read the CSV file 'Zephyr Data Emo with new categories-Copy1.csv'.
# engine='python' handles non-default delimiter options, and delimiter=';'
# tells pandas that the file is semicolon-separated.
raw_core = pd.read_csv('Zephyr Data Emo with new categories-Copy1.csv', engine='python', delimiter=';')
# Get the value counts for the 'Activities Detailed' column
value_counts = raw_core['Activities Detailed'].value_counts()
# Create the bar plot
plt.figure(figsize=(10, 6)) # Optional: Adjust the figure size if needed
plt.bar(value_counts.index, value_counts.values)
# Optional: Rotate the x-axis labels if they are too long
plt.xticks(rotation=90)
# Optional: Add labels and title
plt.xlabel('Activities Detailed')
plt.ylabel('Count')
plt.title('Value Counts of Activities Detailed')
# Show the plot
plt.tight_layout() # Optional: To avoid cutoff of labels
plt.show()
# Create a new DataFrame 'raw' that keeps only rows where 'Activities Detailed' is not 'papper work'.
# This excludes the 'papper work' category (spelled as in the source data), which has only one entry
# and is therefore too rare to stratify or learn from.
raw = raw_core[raw_core['Activities Detailed'] != 'papper work']
# Extract the 'Activities Detailed' column from 'raw' as the target variable 'y'.
y = raw[['Activities Detailed']]
# Creating a new DataFrame 'X' by dropping the 'Activities Detailed' and 'Activities' columns from the original DataFrame 'raw'.
# 'X' will contain the remaining features (columns) of the data.
X = raw.drop(columns=['Activities Detailed', 'Activities'])
sns.set_palette("Set1", desat=1)
# Create a FacetGrid
facetgrid = sns.FacetGrid(raw, hue='Activities Detailed', height=6, aspect=2)
# Map the distribution plot
facetgrid.map(sns.kdeplot, 'HR', fill=True)
# Add legend
facetgrid.add_legend()
# Annotations
plt.annotate("dream", xy=(55, 0.06), xytext=(60, 0.08),
xycoords='data', textcoords='data',
va='center', ha='right',
arrowprops=dict(arrowstyle="simple", connectionstyle="arc3,rad=0.1"))
plt.annotate("Competition", xy=(175, 0.005), xytext=(180, 0.02),
xycoords='data', textcoords='data',
va='center', ha='left',
arrowprops=dict(arrowstyle="simple", connectionstyle="arc3,rad=0.1"))
plt.annotate("training", xy=(100, 0.04), xytext=(110, 0.05),
xycoords='data', textcoords='data',
va='center', ha='left',
arrowprops=dict(arrowstyle="simple", connectionstyle="arc3,rad=0.1"))
plt.annotate("training", xy=(100, 0.04), xytext=(110, 0.05),
xycoords='data', textcoords='data',
va='center', ha='left',
arrowprops=dict(arrowstyle="simple", connectionstyle="arc3,rad=0.1"))
plt.annotate("public speaking", xy=(120, 0.02), xytext=(125, 0.03),
xycoords='data', textcoords='data',
va='center', ha='left',
arrowprops=dict(arrowstyle="simple", connectionstyle="arc3,rad=0.1"))
# Show the plot
plt.show()
groups = raw.groupby('Activities Detailed')
# Create an empty dictionary to store the smaller dataframes
smaller_dfs = {}
# Iterate over each group and create separate dataframes
for label, group_df in groups:
    smaller_dfs[label] = group_df.copy()
sns.set_palette("Set1", desat=1)
# Iterate over each label and corresponding dataframe
for label, df in smaller_dfs.items():
    # Create a FacetGrid for the current dataframe. FacetGrid creates its own
    # figure, so no separate plt.figure() call is needed (the extra call only
    # produced empty figures in the output).
    facetgrid = sns.FacetGrid(df, hue='Activities Detailed', height=3, aspect=2)
    # Map the distribution plot
    facetgrid.map(sns.kdeplot, 'HR', fill=True)
    # Add legend
    facetgrid.add_legend()
    mean_val = df['HR'].mean()
    min_val = df['HR'].min()
    max_val = df['HR'].max()
    # Add text annotations for mean, min, and max values
    plt.text(0.6, 0.8, f"Mean: {mean_val:.2f}", transform=plt.gca().transAxes)
    plt.text(0.6, 0.7, f"Min: {min_val:.2f}", transform=plt.gca().transAxes)
    plt.text(0.6, 0.6, f"Max: {max_val:.2f}", transform=plt.gca().transAxes)
    # Set title
    plt.title(label)
    # Show the plot
    plt.show()
correlation_matrix = raw.drop(columns=['Controled stress', ' stress',
                                       'Before Controled stress', 'After controlled stress',
                                       'Jumps', 'JumpFlightTime']).select_dtypes(include=['float64', 'int64']).corr().round(2)
# Create the heatmap
plt.figure(figsize=(18, 8)) # Adjust the figure size as needed
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
# Set the plot title
plt.title('Pairwise Correlation Heatmap')
# Show the plot
plt.show()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42, stratify = y)
class DataCleanerTransformer:
    def __init__(self):
        # Define the columns that need to be converted to float
        self.float_columns = ['Activity', 'PeakAcceleration', 'ECGAmplitude', 'ECGNoise', 'CoreTemp',
                              'AvStepPeriod', 'AvForceDevRate', 'AvStepImpulse']

    def fit(self, data):
        # Nothing to fit in this case, so we return self.
        return self

    def transform(self, data):
        # Convert selected columns from comma-decimal strings to float
        data[self.float_columns] = data[self.float_columns].apply(lambda x: x.str.replace(',', '.').astype(float))
        # Convert 'Time' column to pandas datetime format
        data['Time_New'] = pd.to_datetime(data['Time'], format="%d.%m.%Y %H:%M:%S")
        # Fill NaN values with 0
        data = data.fillna(0)
        return data

    def fit_transform(self, data):
        # Fit the transformer (a no-op here), then transform the data
        self.fit(data)
        return self.transform(data)
# Example usage:
# Instantiate the DataCleanerTransformer class
data_cleaner = DataCleanerTransformer()
X_train_loaded = data_cleaner.fit_transform(X_train)
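For reference, the imports include Pipeline, FunctionTransformer, and ColumnTransformer, which this notebook does not end up using. Below is a minimal sketch (an illustrative variant under assumptions, not the notebook's actual code) of how the same cleaning step could be made fully sklearn-compatible so it chains inside a Pipeline:

# Hypothetical sklearn-compatible variant of DataCleanerTransformer.
# BaseEstimator/TransformerMixin provide get_params/set_params and a free
# fit_transform, so the cleaner can be chained inside a Pipeline with the scaler.
from sklearn.base import BaseEstimator, TransformerMixin

class SklearnDataCleaner(BaseEstimator, TransformerMixin):
    def __init__(self, float_columns=('Activity', 'PeakAcceleration')):
        self.float_columns = float_columns  # stored as passed, per sklearn convention

    def fit(self, X, y=None):
        return self  # stateless: nothing is learned from the training data

    def transform(self, X):
        X = X.copy()  # avoid mutating the caller's DataFrame
        cols = list(self.float_columns)
        X[cols] = X[cols].apply(lambda s: s.str.replace(',', '.').astype(float))
        return X.fillna(0)

# Usage sketch:
# pipe = Pipeline([('clean', SklearnDataCleaner()), ('scale', StandardScaler())])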
drop = [
    # Time-related columns are removed: the model otherwise overfits by memorizing
    # when each activity was done
    'Year', 'Month', 'Weekday', 'Time', 'Time_New', 'Date', 'Hour',
    # With few participants, the model would overfit by learning who did what
    'Name of the volunteer',
    # These are health KPIs of how the Holter machine works and don't depend on the activity
    'BRAmplitude', 'ECGAmplitude', 'HRConfidence',
    # These measure how the machine works and whether it detects noise; they are not what we model
    'ECGNoise', 'Jumps', 'JumpFlightTime'
    # Stress is manually inputted
    # ,'Controled stress',' stress', 'Before Controled stress', 'After controlled stress'
]
X_preprocessed = X_train_loaded.drop(columns = drop)
ratios = (X_preprocessed.nunique() / X_preprocessed.count()).sort_values(ascending=True)
# Create a bar plot to visualize the ratios
plt.figure(figsize=(10, 6))
plt.barh(ratios.index, ratios.values)
plt.xlabel('Ratio of Unique Values to Total Count')
plt.ylabel('Columns')
plt.title('Ratio of Unique Values to Total Count for each Column')
plt.tight_layout()
plt.show()
# Ratio of variance to mean (index of dispersion) for each column
ratios = (X_preprocessed.var() / X_preprocessed.mean()).sort_values(ascending=True)
# Create a bar plot to visualize the ratios
plt.figure(figsize=(10, 6))
# Color negative values in red, and non-negative values in the default color (blue)
colors = ['red' if value < 0 else 'blue' for value in ratios.values]
plt.barh(ratios.index, ratios.values, color=colors)
plt.xlabel('Ratio of Variance to Mean')
plt.ylabel('Columns')
plt.title('Ratio of Variance to Mean for each Column')
plt.tight_layout()
plt.show()
# Combine the two diagnostics in one view
unique_ratios = (X_preprocessed.nunique() / X_preprocessed.count()).sort_values(ascending=True)
# Align the variance-to-mean ratios to the same column order; sorting the two
# Series independently would pair values from different columns in the scatter plot
variance_ratios = (X_preprocessed.var() / X_preprocessed.mean()).reindex(unique_ratios.index)
# Create a scatter plot to combine the two ratios
plt.figure(figsize=(10, 6))
# Color negative variance-to-mean ratios red and the rest blue, matching the bar plot
colors = ['red' if value < 0 else 'blue' for value in variance_ratios.values]
# Plot unique-to-total count ratios on the x-axis and variance-to-mean ratios on the y-axis
plt.scatter(unique_ratios.values, variance_ratios.values, color=colors)
# Annotate each point with the column name next to the data point
for i, col in enumerate(unique_ratios.index):
    plt.annotate(col, xy=(unique_ratios.values[i], variance_ratios.values[i]),
                 xytext=(5, 0), textcoords='offset points')
# Set log scales for both x and y axes
plt.xscale('log')
plt.yscale('log')
plt.xlabel('Unique-to-Total Count Ratio')
plt.ylabel('Variance-to-Mean Ratio')
plt.title('Unique-to-Total Count vs. Variance-to-Mean Ratio for each Column (Log Scales)')
plt.tight_layout()
plt.show()
class AnyGreaterThanZeroTransformer:
    def __init__(self, columns, c_name):
        self.columns = columns
        self.c_name = c_name

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # Create a new binary column: 1 if any of the specified columns is > 0, else 0
        X[f'{self.c_name}'] = (X[self.columns] > 0).any(axis=1).astype(int)
        # Drop the original specified columns as they are no longer needed
        X.drop(columns=self.columns, inplace=True)
        return X

    def fit_transform(self, X, y=None):
        return self.fit(X).transform(X)
X_non_binary = X_preprocessed.copy()
# Plot histograms for the specified columns; figsize is passed to hist directly,
# since a separate plt.figure() call would only create an extra empty figure
X_preprocessed.hist(column=['Controled stress', ' stress', 'Before Controled stress', 'After controlled stress'], bins=10, figsize=(10, 6))
plt.xlabel('Values')
plt.ylabel('Frequency')
# plt.title('Histograms for the Specified Columns')
plt.tight_layout()
plt.show()
# Plot boxplots for the specified columns
plt.figure(figsize=(10, 6))
sns.boxplot(data=X_preprocessed[['Controled stress', ' stress', 'Before Controled stress', 'After controlled stress']])
plt.xlabel('Columns')
plt.ylabel('Values')
# plt.title('Boxplots for the Specified Columns')
plt.tight_layout()
plt.show()
# Use fit_transform to collapse the four stress columns into one binary flag.
# Note: the transformer mutates its input in place, so Stress_Data and
# X_non_binary end up referring to the same underlying frame.
Stress_Data = AnyGreaterThanZeroTransformer(
    columns=['Controled stress', ' stress', 'Before Controled stress', 'After controlled stress'],
    c_name='Stress_Binary').fit_transform(X_non_binary)
# Plot histograms for the specified columns
X_preprocessed.hist(column=['MajorImpacts', 'MinorImpacts'], bins=10, figsize=(10, 6))
plt.xlabel('Values')
plt.ylabel('Frequency')
# plt.title('Histograms for the Specified Columns')
plt.tight_layout()
plt.show()
# Plot boxplots for the specified columns
plt.figure(figsize=(10, 6))
sns.boxplot(data=X_preprocessed[['MajorImpacts','MinorImpacts']])
plt.xlabel('Columns')
plt.ylabel('Values')
# plt.title('Boxplots for the Specified Columns')
plt.tight_layout()
plt.show()
Impact_Data = AnyGreaterThanZeroTransformer(columns=['MajorImpacts', 'MinorImpacts'],
                                            c_name='Impact_Binary').fit_transform(Stress_Data)
# Plot histograms for the specified columns
X_preprocessed.hist(column=['AvForceDevRate', 'AvStepImpulse', 'AvStepPeriod'], bins=10, figsize=(10, 6))
plt.xlabel('Values')
plt.ylabel('Frequency')
# plt.title('Histograms for the Specified Columns')
plt.tight_layout()
plt.show()
# Plot boxplots for the specified columns
plt.figure(figsize=(10, 6))
sns.boxplot(data=X_preprocessed[['AvForceDevRate', 'AvStepImpulse', 'AvStepPeriod']])
plt.xlabel('Columns')
plt.ylabel('Values')
# plt.title('Boxplots for the Specified Columns')
plt.tight_layout()
plt.show()
Av_Data = AnyGreaterThanZeroTransformer(columns=['AvForceDevRate', 'AvStepImpulse', 'AvStepPeriod'],
                                        c_name='Av_Binary').fit_transform(Stress_Data)
# Plot histograms for the specified columns
X_preprocessed.hist(column=['PeakAcceleration', 'PeakAccelPhi', 'peakAccelTheta'], bins=10, figsize=(10, 6))
plt.xlabel('Values')
plt.ylabel('Frequency')
# plt.title('Histograms for the Specified Columns')
plt.tight_layout()
plt.show()
# Plot boxplots for the specified columns
plt.figure(figsize=(10, 6))
sns.boxplot(data=X_preprocessed[['PeakAcceleration', 'PeakAccelPhi', 'peakAccelTheta']])
plt.xlabel('Columns')
plt.ylabel('Values')
# plt.title('Boxplots for the Specified Columns')
plt.tight_layout()
plt.show()
Accel_Data = AnyGreaterThanZeroTransformer(columns=['PeakAcceleration'],
                                           c_name='Acc_Binary').fit_transform(Stress_Data)
# Plot histograms for the specified columns
X_preprocessed.hist(column=['Bounds', 'RunSteps'], bins=10, figsize=(10, 6))
plt.xlabel('Values')
plt.ylabel('Frequency')
# plt.title('Histograms for the Specified Columns')
plt.tight_layout()
plt.show()
# Plot boxplots for the specified columns
plt.figure(figsize=(10, 6))
sns.boxplot(data=X_preprocessed[['Bounds', 'RunSteps']])
plt.xlabel('Columns')
plt.ylabel('Values')
# plt.title('Boxplots for the Specified Columns')
plt.tight_layout()
plt.show()
Bounds_Data = AnyGreaterThanZeroTransformer(columns=['Bounds'],
                                            c_name='Bounds_Binary').fit_transform(Stress_Data)
X_train_outlier = Bounds_Data.copy()
# Identify non-binary columns
non_binary_columns = ['HR', 'HRV', 'BR', 'Posture', 'Activity']
binary_columns = X_train_outlier.drop(columns=non_binary_columns).columns.tolist()
# Plot boxplots for the specified columns
plt.figure(figsize=(10, 6))
sns.boxplot(data=X_train_outlier[non_binary_columns])
plt.xlabel('Columns')
plt.ylabel('Values')
# plt.title('Boxplots for the Specified Columns')
plt.tight_layout()
plt.show()
class OutlierReplacer:
    def __init__(self, columns):
        self.columns = columns
        self.iqr_ranges = {}

    def fit(self, X, y=None):
        # Learn the IQR fences (Q1 - 1.5*IQR, Q3 + 1.5*IQR) per column on the training data
        for column in self.columns:
            Q1 = X[column].quantile(0.25)
            Q3 = X[column].quantile(0.75)
            IQR = Q3 - Q1
            self.iqr_ranges[column] = (Q1 - 1.5 * IQR, Q3 + 1.5 * IQR)
        return self

    def transform(self, X):
        X_transformed = X.copy()
        for column in self.columns:
            if column in self.iqr_ranges:
                outliers = (X_transformed[column] < self.iqr_ranges[column][0]) | (X_transformed[column] > self.iqr_ranges[column][1])
                detected_outliers = X_transformed[outliers]
                clean_data_iqr = X_transformed[~outliers].fillna(0)
                # Replace each outlier with the inlier mean plus a little Gaussian noise
                mean_value = clean_data_iqr[column].mean()
                noise = np.random.normal(0, 0.1, len(detected_outliers))
                mean_value_with_noise = noise + mean_value
                X_transformed.loc[outliers, column] = mean_value_with_noise
        return X_transformed

    def fit_transform(self, X, y=None):
        return self.fit(X).transform(X)
# Instantiate the OutlierReplacer, fit it on the training data, then transform the data
outlier_replacer = OutlierReplacer(columns=non_binary_columns).fit(X_train_outlier)
X_train_less_outliers = outlier_replacer.transform(X_train_outlier)
# Make a copy of the original DataFrame
X_train_scaled = X_train_less_outliers.copy()
# Fit the StandardScaler on the non-binary columns only, then transform them;
# the binary flag columns are left unscaled
scaler = StandardScaler().fit(X_train_less_outliers[non_binary_columns])
X_train_scaled[non_binary_columns] = scaler.transform(X_train_less_outliers[non_binary_columns])
# Encode activity labels as integers; LabelEncoder expects a 1-D array,
# so the column is passed as a Series rather than a DataFrame
label_encoder = LabelEncoder().fit(y_train['Activities Detailed'])
y_train['Target'] = label_encoder.transform(y_train['Activities Detailed'])
y_train_target = y_train[['Target']]
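A quick optional sanity check (not in the original notebook): the encoding is reversible, which is what later lets the report table map integer classes back to activity names via inverse_transform.

# Illustrative round-trip check: decoding the targets recovers the original strings
decoded = label_encoder.inverse_transform(y_train['Target'])
assert (decoded == y_train['Activities Detailed']).all()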
clf1 = RandomForestClassifier(
n_estimators=271,
max_depth=36,
min_samples_split=19,
min_samples_leaf=11,
random_state=42
)
clf2 = XGBClassifier(
learning_rate=0.06048731265187917,
n_estimators=195,
max_depth=9,
subsample= 0.7790510010086706,
colsample_bytree=0.6220627611238871,
random_state=42
)
clf3 = AdaBoostClassifier(
n_estimators=170,
learning_rate=0.2379351625419417,
random_state=42
)
clf4 = GradientBoostingClassifier(
n_estimators=186,
learning_rate=0.01208563915935721,
max_depth=9,
subsample=0.7424149856794916,
random_state=42
)
clf5 = DecisionTreeClassifier(
max_depth=16,
min_samples_split=9,
min_samples_leaf=4,
random_state=42
)
# Define the ensemble classifier with the scaler
models = VotingClassifier(estimators=[
('rfc', clf1),
('xgb', clf2),
('ada', clf3),
('gbc', clf4),
('dtc', clf5)],
voting='soft')
# Fit the ensemble to the training data (y passed as a 1-D Series)
models.fit(X_train_scaled[['HR', 'HRV', 'BR', 'Posture', 'Activity', 'Impact_Binary', 'Bounds_Binary']],
           y_train_target['Target'])
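With voting='soft', the VotingClassifier averages the five models' predicted class probabilities and takes the argmax. A hedged sketch of the equivalent manual computation using the fitted sub-estimators (equal weights, as configured above):

# Manual soft vote: average predict_proba over the fitted sub-estimators,
# then pick the class with the highest mean probability. With equal weights
# this should match models.predict up to tie-breaking.
features = ['HR', 'HRV', 'BR', 'Posture', 'Activity', 'Impact_Binary', 'Bounds_Binary']
avg_proba = np.mean([est.predict_proba(X_train_scaled[features])
                     for est in models.estimators_], axis=0)
manual_pred = models.classes_[np.argmax(avg_proba, axis=1)]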
X
|        | Time | Year | Month | Weekday | Hour | HR | BR | Posture | Activity | PeakAcceleration | ... | AvStepImpulse | AvStepPeriod | JumpFlightTime | PeakAccelPhi | peakAccelTheta | Controled stress | stress | Before Controled stress | After controlled stress | Name of the volunteer |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 26.3.2020 9:02:28 | 2020 | March | Thursday | 9 | 79 | 15 | 16 | 0,01 | 0,02 | ... | 0 | 0 | 0 | 163 | -50 | 1.0 | NaN | NaN | NaN | EM |
| 1 | 26.3.2020 9:02:29 | 2020 | March | Thursday | 9 | 79 | 15 | 16 | 0,01 | 0,1 | ... | 0 | 0 | 0 | 168 | -87 | 1.0 | NaN | NaN | NaN | EM |
| 2 | 26.3.2020 9:02:30 | 2020 | March | Thursday | 9 | 79 | 15 | 17 | 0,01 | 0,03 | ... | 0 | 0 | 0 | 164 | -47 | 1.0 | NaN | NaN | NaN | EM |
| 3 | 26.3.2020 9:02:31 | 2020 | March | Thursday | 9 | 79 | 15 | 17 | 0,01 | 0,02 | ... | 0 | 0 | 0 | 163 | -43 | 1.0 | NaN | NaN | NaN | EM |
| 4 | 26.3.2020 9:02:32 | 2020 | March | Thursday | 9 | 79 | 15 | 17 | 0 | 0,02 | ... | 0 | 0 | 0 | 163 | -50 | 1.0 | NaN | NaN | NaN | EM |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 271793 | 22.6.2022 6:35:10 | 2022 | June | Wednesday | 6 | 84 | 20 | -34 | 0,04 | 0,08 | ... | 2,75 | 0,351 | 0 | 140 | 91 | NaN | 1.0 | NaN | NaN | EM |
| 271794 | 22.6.2022 6:35:11 | 2022 | June | Wednesday | 6 | 84 | 20 | -35 | 0,01 | 0,03 | ... | 0 | 0 | 0 | 142 | 91 | NaN | 1.0 | NaN | NaN | EM |
| 271795 | 22.6.2022 6:35:12 | 2022 | June | Wednesday | 6 | 84 | 19 | -36 | 0,04 | 0,19 | ... | 0 | 0 | 0 | 133 | 95 | NaN | 1.0 | NaN | NaN | EM |
| 271796 | 22.6.2022 6:35:13 | 2022 | June | Wednesday | 6 | 85 | 19 | -38 | 0,05 | 0,09 | ... | 0 | 0 | 0 | 135 | 95 | NaN | 1.0 | NaN | NaN | EM |
| 271797 | 22.6.2022 6:35:14 | 2022 | June | Wednesday | 6 | 85 | 19 | -40 | 0,06 | 0,1 | ... | 0 | 0 | 0 | 134 | 99 | NaN | 1.0 | NaN | NaN | EM |

271797 rows × 35 columns
from sklearn.model_selection import RandomizedSearchCV
from scipy import stats
import dask.diagnostics
# Define the parameter distributions for each classifier
param_distributions = {
'rfc__n_estimators': stats.randint(100, 500),
'rfc__max_depth': [None] + list(np.arange(5, 51)),
'rfc__min_samples_split': stats.randint(2, 20),
'rfc__min_samples_leaf': stats.randint(1, 20),
'xgb__learning_rate': stats.uniform(0.01, 0.3),
'xgb__n_estimators': stats.randint(100, 500),
'xgb__max_depth': stats.randint(2, 10),
'xgb__subsample': stats.uniform(0.5, 0.5),
'xgb__colsample_bytree': stats.uniform(0.5, 0.5),
'ada__n_estimators': stats.randint(50, 200),
'ada__learning_rate': stats.uniform(0.01, 1.0),
'gbc__n_estimators': stats.randint(50, 200),
'gbc__learning_rate': stats.uniform(0.01, 0.3),
'gbc__max_depth': stats.randint(2, 10),
'gbc__subsample': stats.uniform(0.5, 0.5),
'dtc__max_depth': [None] + list(np.arange(5, 51)),
'dtc__min_samples_split': stats.randint(2, 20),
'dtc__min_samples_leaf': stats.randint(1, 20)
}
# # Perform the random search
# search = RandomizedSearchCV(models, param_distributions, n_iter=10, cv=3, verbose=2, random_state=42)
# with dask.diagnostics.ProgressBar():
# search.fit(X_train_scaled.sample(n= 10000, random_state= 42)[['HR','HRV','BR','Posture', 'Activity', 'Impact_Binary','Bounds_Binary']], y_train_target.sample(n= 10000, random_state= 42))
best_params_ = {'ada__learning_rate': 0.2379351625419417, 'ada__n_estimators': 170, 'dtc__max_depth': 16, 'dtc__min_samples_leaf': 9, 'dtc__min_samples_split': 4, 'gbc__learning_rate': 0.01208563915935721, 'gbc__max_depth': 9, 'gbc__n_estimators': 186, 'gbc__subsample': 0.7424149856794916, 'rfc__max_depth': 36, 'rfc__min_samples_leaf': 19, 'rfc__min_samples_split': 11, 'rfc__n_estimators': 271, 'xgb__colsample_bytree': 0.6220627611238871, 'xgb__learning_rate': 0.06048731265187917, 'xgb__max_depth': 9, 'xgb__n_estimators': 195, 'xgb__subsample': 0.7790510010086706}
# Check the best parameters
print(best_params_)
{'ada__learning_rate': 0.2379351625419417, 'ada__n_estimators': 170, 'dtc__max_depth': 16, 'dtc__min_samples_leaf': 9, 'dtc__min_samples_split': 4, 'gbc__learning_rate': 0.01208563915935721, 'gbc__max_depth': 9, 'gbc__n_estimators': 186, 'gbc__subsample': 0.7424149856794916, 'rfc__max_depth': 36, 'rfc__min_samples_leaf': 19, 'rfc__min_samples_split': 11, 'rfc__n_estimators': 271, 'xgb__colsample_bytree': 0.6220627611238871, 'xgb__learning_rate': 0.06048731265187917, 'xgb__max_depth': 9, 'xgb__n_estimators': 195, 'xgb__subsample': 0.7790510010086706}
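Rather than transcribing tuned values into the clf1-clf5 definitions by hand (which risks transcription slips), sklearn's nested parameter routing can apply them directly; a hedged sketch:

# Apply the tuned hyperparameters to the ensemble via nested parameter routing:
# 'rfc__n_estimators' reaches the 'rfc' step's n_estimators, and so on.
# The ensemble must then be refit for the new settings to take effect.
models.set_params(**best_params_)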
y_pred_train = models.predict(X_train_scaled[['HR', 'HRV', 'BR', 'Posture', 'Activity', 'Impact_Binary', 'Bounds_Binary']])
# Generate a classification report; the signature is classification_report(y_true, y_pred)
class_report = classification_report(y_train_target, y_pred_train)
print("Classification Report:")
print(class_report)
Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.98        57
           1       0.97      0.94      0.95       448
           2       0.97      0.99      0.98      2004
           3       0.65      0.97      0.78      1680
           4       0.95      0.87      0.91      3627
           5       0.99      1.00      1.00     44166
           6       0.98      1.00      0.99      1252
           7       0.86      0.86      0.86      1755
           8       0.76      0.88      0.82      4412
           9       0.96      0.90      0.93     51942
          10       0.99      0.99      0.99      2782
          11       0.99      0.98      0.99      2218
          12       0.93      0.84      0.88      2461
          13       1.00      0.98      0.99      7251
          14       0.95      0.97      0.96      2163
          15       0.80      0.86      0.83      5265
          16       0.93      0.96      0.95     45417
          17       0.88      0.95      0.91      2692
          18       0.72      0.97      0.82       140
          19       0.97      0.98      0.97       371

    accuracy                           0.94    182103
   macro avg       0.91      0.94      0.92    182103
weighted avg       0.95      0.94      0.95    182103
# Save the pipeline to a file
filename = 'ensamble_fitted_model_summerschool2023_stress_HR_BR_HRV.joblib'
joblib.dump(models, filename)
['ensamble_fitted_model_summerschool2023_stress_HR_BR_HRV.joblib']
# Get feature importances for each individual model
feature_importances = pd.DataFrame()
for name, clf in models.named_estimators_.items():
    if hasattr(clf, 'feature_importances_'):
        feature_importances[name] = clf.feature_importances_
feature_importances.index = X_train_scaled[['HR', 'HRV', 'BR', 'Posture', 'Activity', 'Impact_Binary', 'Bounds_Binary']].columns
feature_importances
|               | rfc      | xgb      | ada      | gbc      | dtc      |
|---------------|----------|----------|----------|----------|----------|
| HR            | 0.211334 | 0.063243 | 0.052941 | 0.263226 | 0.180447 |
| HRV           | 0.193873 | 0.048195 | 0.123529 | 0.205495 | 0.228883 |
| BR            | 0.086372 | 0.026917 | 0.182353 | 0.071380 | 0.076369 |
| Posture       | 0.240190 | 0.072173 | 0.288235 | 0.239884 | 0.218524 |
| Activity      | 0.027014 | 0.009675 | 0.023529 | 0.015290 | 0.009971 |
| Impact_Binary | 0.161471 | 0.522415 | 0.205882 | 0.134348 | 0.193274 |
| Bounds_Binary | 0.079747 | 0.257381 | 0.123529 | 0.070377 | 0.092531 |
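VotingClassifier exposes no aggregate feature_importances_ of its own, which is why the table above collects them per estimator. A hedged sketch of one way to summarize them at the ensemble level (a plain unweighted mean, mirroring the equal-weight soft vote):

# Average the per-model importances into a single ensemble-level ranking.
# Tree-based importances each sum to 1 per model, so a plain mean keeps the scale.
ensemble_importance = feature_importances.mean(axis=1).sort_values(ascending=False)
print(ensemble_importance)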
feature_importances.plot(kind='bar')
# Calculate the correlation matrix
corr_matrix = feature_importances.corr()
# Create a heatmap of the correlation matrix
plt.figure(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.title('Correlation Matrix')
plt.show()
X_test_initial = X_test.copy()
X_test_loaded = data_cleaner.transform(X_test_initial) #data cleaning
X_test_preprocessed = X_test_loaded.drop(columns = drop) #columns drop
X_test_non_binary = X_test_preprocessed.copy() #copy df
X_test_Stress_Data = AnyGreaterThanZeroTransformer(
    columns=['Controled stress', ' stress', 'Before Controled stress', 'After controlled stress'],
    c_name='Stress_Binary').transform(X_test_non_binary)  # binary stress
X_test_Impact_Data = AnyGreaterThanZeroTransformer(
    columns=['MajorImpacts', 'MinorImpacts'],
    c_name='Impact_Binary').transform(X_test_Stress_Data)  # binary impacts
X_test_Av_Data = AnyGreaterThanZeroTransformer(
    columns=['AvForceDevRate', 'AvStepImpulse', 'AvStepPeriod'],
    c_name='Av_Binary').transform(X_test_Impact_Data)  # binary Av columns
X_test_Accel_Data = AnyGreaterThanZeroTransformer(
    columns=['PeakAcceleration'],
    c_name='Acc_Binary').transform(X_test_Impact_Data)  # binary accel (same frame, mutated in place)
X_test_Bounds_Data = AnyGreaterThanZeroTransformer(
    columns=['Bounds'],
    c_name='Bounds_Binary').transform(X_test_Accel_Data)  # binary bounds
X_test_outlier = X_test_Bounds_Data.copy() #copying the data before outlier removal
X_test_less_outliers = outlier_replacer.transform(X_test_outlier) #outliers removal
X_test_scaled = X_test_less_outliers.copy() #outliers df copy
X_test_scaled[non_binary_columns] = scaler.transform(X_test_scaled[non_binary_columns])  # scale with the scaler fitted on the training data
y_test['Target'] = label_encoder.transform(y_test['Activities Detailed'])  # encode labels (1-D input)
y_test_target = y_test[['Target']]  # target class
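The test-side steps above repeat the training chain by hand; a hedged sketch (illustrative only, reusing the already-fitted objects) of how the whole chain could be collapsed into one helper so the train and test paths cannot drift apart:

# Hypothetical helper: re-apply the full preprocessing chain to any split.
# Uses the fitted data_cleaner, outlier_replacer, and scaler, so no information
# leaks from the test set.
def preprocess(df):
    df = data_cleaner.transform(df.copy())
    df = df.drop(columns=drop)
    for cols, name in [(['Controled stress', ' stress', 'Before Controled stress', 'After controlled stress'], 'Stress_Binary'),
                       (['MajorImpacts', 'MinorImpacts'], 'Impact_Binary'),
                       (['AvForceDevRate', 'AvStepImpulse', 'AvStepPeriod'], 'Av_Binary'),
                       (['PeakAcceleration'], 'Acc_Binary'),
                       (['Bounds'], 'Bounds_Binary')]:
        df = AnyGreaterThanZeroTransformer(columns=cols, c_name=name).transform(df)
    df = outlier_replacer.transform(df)
    df[non_binary_columns] = scaler.transform(df[non_binary_columns])
    return df

# X_test_scaled = preprocess(X_test)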
loaded_model = joblib.load('ensamble_fitted_model_summerschool2023_stress_HR_BR_HRV.joblib')
loaded_model
y_test_pred = loaded_model.predict(X_test_scaled[['HR', 'HRV', 'BR', 'Posture', 'Activity', 'Impact_Binary', 'Bounds_Binary']])
# Generate a classification report (y_true first, then y_pred)
test_class_report = classification_report(y_test_target, y_test_pred)
print("Classification Report:")
print(test_class_report)
Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.96      0.95        28
           1       0.92      0.92      0.92       213
           2       0.94      0.98      0.96       969
           3       0.58      0.96      0.72       745
           4       0.93      0.85      0.89      1781
           5       0.99      0.99      0.99     21762
           6       0.97      0.99      0.98       617
           7       0.80      0.83      0.82       829
           8       0.72      0.84      0.78      2161
           9       0.94      0.87      0.91     25781
          10       0.98      0.99      0.98      1363
          11       0.96      0.95      0.96      1095
          12       0.85      0.81      0.83      1144
          13       0.99      0.97      0.98      3601
          14       0.93      0.96      0.95      1058
          15       0.75      0.81      0.78      2586
          16       0.92      0.95      0.93     22428
          17       0.80      0.89      0.85      1299
          18       0.53      0.71      0.61        70
          19       0.88      0.98      0.93       164

    accuracy                           0.93     89694
   macro avg       0.87      0.91      0.88     89694
weighted avg       0.93      0.93      0.93     89694
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.colors as mcolors
# Define a colormap where zero values are white (aliased as mcolors to avoid
# shadowing the 'colors' list defined earlier)
cmap = mcolors.ListedColormap(['white', 'lightpink', 'lightblue'])
# Define the bounds for each color
bounds = [0, 1, 2, 3]
# Create a normalization based on the bounds
norm = mcolors.BoundaryNorm(bounds, cmap.N)
# Compute the confusion matrix
cm = confusion_matrix(y_test_target, y_test_pred)
# Get sorted labels
labels = sorted(y_test['Activities Detailed'].unique())
# Create the heatmap
plt.figure(figsize=(10,7))
sns.heatmap(cm, annot=False, cmap=cmap, norm=norm, xticklabels=labels, yticklabels=labels)
plt.title('Confusion Matrix based on Frequencies', size=16)
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.show()
# Generate classification reports as dictionaries (y_true first, then y_pred)
class_report = classification_report(y_train_target, y_pred_train, output_dict=True)
test_class_report = classification_report(y_test_target, y_test_pred, output_dict=True)
# Create dataframes
df_test = pd.DataFrame(test_class_report).transpose()
df = pd.DataFrame(class_report).transpose()
# Add suffixes to the column names
df_test.columns = [str(col) + '_Test' for col in df_test.columns]
df.columns = [str(col) for col in df.columns]
# Concatenate the dataframes
result_df = pd.concat([df, df_test], axis=1).reset_index(drop=False).iloc[:-3,:]
macro_df = pd.concat([df, df_test], axis=1).reset_index(drop=False).iloc[-3:,:]
result_df['index'] = result_df['index'].astype(int)
result_df['Label'] = label_encoder.inverse_transform(result_df['index'])
result_df
|    | index | precision | recall   | f1-score | support | precision_Test | recall_Test | f1-score_Test | support_Test | Label |
|----|-------|-----------|----------|----------|---------|----------------|-------------|---------------|--------------|-------|
| 0  | 0  | 0.966102 | 1.000000 | 0.982759 | 57.0    | 0.931034 | 0.964286 | 0.947368 | 28.0    | Coherent Breathing |
| 1  | 1  | 0.974419 | 0.935268 | 0.954442 | 448.0   | 0.924528 | 0.920188 | 0.922353 | 213.0   | Play Quitar |
| 2  | 2  | 0.966423 | 0.991018 | 0.978566 | 2004.0  | 0.940711 | 0.982456 | 0.961131 | 969.0   | cognitive workout |
| 3  | 3  | 0.647551 | 0.967857 | 0.775948 | 1680.0  | 0.576395 | 0.957047 | 0.719475 | 745.0   | competition |
| 4  | 4  | 0.952439 | 0.866832 | 0.907621 | 3627.0  | 0.927429 | 0.846715 | 0.885236 | 1781.0  | creative writing |
| 5  | 5  | 0.991900 | 0.998121 | 0.995001 | 44166.0 | 0.988853 | 0.994670 | 0.991753 | 21762.0 | dream |
| 6  | 6  | 0.984252 | 0.998403 | 0.991277 | 1252.0  | 0.974400 | 0.987034 | 0.980676 | 617.0   | leisure |
| 7  | 7  | 0.861206 | 0.862678 | 0.861941 | 1755.0  | 0.799076 | 0.834741 | 0.816519 | 829.0   | meeting with client |
| 8  | 8  | 0.764510 | 0.883726 | 0.819807 | 4412.0  | 0.723726 | 0.841277 | 0.778087 | 2161.0  | negotiation |
| 9  | 9  | 0.957183 | 0.895210 | 0.925160 | 51942.0 | 0.940739 | 0.873124 | 0.905671 | 25781.0 | office work |
| 10 | 10 | 0.990298 | 0.990654 | 0.990476 | 2782.0  | 0.980306 | 0.986060 | 0.983175 | 1363.0  | phone calls |
| 11 | 11 | 0.992230 | 0.978810 | 0.985474 | 2218.0  | 0.963788 | 0.947945 | 0.955801 | 1095.0  | play piano |
| 12 | 12 | 0.925460 | 0.837464 | 0.879266 | 2461.0  | 0.847767 | 0.812937 | 0.829987 | 1144.0  | presentation |
| 13 | 13 | 0.995385 | 0.981658 | 0.988474 | 7251.0  | 0.987791 | 0.966121 | 0.976836 | 3601.0  | public speaking |
| 14 | 14 | 0.949458 | 0.972723 | 0.960950 | 2163.0  | 0.931256 | 0.960302 | 0.945556 | 1058.0  | reading book |
| 15 | 15 | 0.804282 | 0.863248 | 0.832723 | 5265.0  | 0.748473 | 0.805491 | 0.775936 | 2586.0  | sales |
| 16 | 16 | 0.932961 | 0.963384 | 0.947928 | 45417.0 | 0.918788 | 0.946317 | 0.932349 | 22428.0 | sport |
| 17 | 17 | 0.875300 | 0.949108 | 0.910711 | 2692.0  | 0.804590 | 0.890685 | 0.845451 | 1299.0  | training |
| 18 | 18 | 0.715789 | 0.971429 | 0.824242 | 140.0   | 0.531915 | 0.714286 | 0.609756 | 70.0    | walking meeting |
| 19 | 19 | 0.967914 | 0.975741 | 0.971812 | 371.0   | 0.875000 | 0.981707 | 0.925287 | 164.0   | writing study |
# result_df already excludes the accuracy/macro/weighted rows, so no further slicing is needed
result_df[['precision', 'precision_Test']].plot(kind='bar')
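To quantify the train-to-test drop visible in the bar chart, a short sketch (using the columns built above) of the per-class f1 gap; the largest gaps flag the classes that generalize worst:

# Per-class generalization gap: train f1 minus test f1, largest first
gap = (result_df['f1-score'] - result_df['f1-score_Test']).set_axis(result_df['Label'])
print(gap.sort_values(ascending=False).head(10))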