import time
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, FunctionTransformer, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import (RandomForestClassifier, VotingClassifier,
                              AdaBoostClassifier, GradientBoostingClassifier)
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, confusion_matrix, classification_report)
from sklearn.metrics.pairwise import euclidean_distances, cosine_similarity
from xgboost import XGBClassifier
# Reading the CSV file 'Zephyr Data Emo with new categories-Copy1.csv'.
# The 'engine' parameter is set to 'python' to handle non-default delimiter options.
# The 'delimiter' parameter is set to ';' to specify that the data in the CSV file is delimited by semicolons.
raw_core = pd.read_csv('Zephyr Data Emo with new categories-Copy1.csv', engine='python', delimiter=';')
# Get the value counts for the 'Activities Detailed' column
value_counts = raw_core['Activities Detailed'].value_counts()
# Create the bar plot
plt.figure(figsize=(10, 6)) # Optional: Adjust the figure size if needed
plt.bar(value_counts.index, value_counts.values)
# Optional: Rotate the x-axis labels if they are too long
plt.xticks(rotation=90)
# Optional: Add labels and title
plt.xlabel('Activities Detailed')
plt.ylabel('Count')
plt.title('Value Counts of Activities Detailed')
# Show the plot
plt.tight_layout() # Optional: To avoid cutoff of labels
plt.show()
# Creating a new DataFrame 'raw' by removing rows where 'Activities Detailed' equals 'papper work'
# (the label is misspelled in the source data); that category has only a single entry, too few to learn from.
raw = raw_core[raw_core['Activities Detailed'] != 'papper work']
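# A more general version of the filter above (a hypothetical sketch, not part of
# the original pipeline): drop every class whose support falls below a minimum
# count instead of hard-coding one label; 'min_count' is an assumed parameter.
def drop_rare_classes(df, column, min_count=2):
    counts = df[column].value_counts()
    keep = counts[counts >= min_count].index
    return df[df[column].isin(keep)]
# Equivalent to the manual filter when only 'papper work' has a single entry:
# raw = drop_rare_classes(raw_core, 'Activities Detailed', min_count=2)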
# Extracting the 'Activities Detailed' column from 'raw' as the target variable 'y'.
y = raw[['Activities Detailed']]
# Creating a new DataFrame 'X' by dropping the 'Activities Detailed' and 'Activities' columns from the original DataFrame 'raw'.
# 'X' will contain the remaining features (columns) of the data.
X = raw.drop(columns=['Activities Detailed', 'Activities'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42, stratify = y)
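# Quick sanity check (not in the original notebook): stratify=y should keep the
# class proportions nearly identical between the train and test splits.
train_dist = y_train['Activities Detailed'].value_counts(normalize=True)
test_dist = y_test['Activities Detailed'].value_counts(normalize=True)
print((train_dist - test_dist).abs().max())  # expected to be close to 0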
class DataCleanerTransformer:
    def __init__(self):
        # Columns stored as comma-decimal strings that need converting to float
        self.float_columns = ['Activity', 'PeakAcceleration', 'ECGAmplitude', 'ECGNoise', 'CoreTemp',
                              'AvStepPeriod', 'AvForceDevRate', 'AvStepImpulse']

    def fit(self, data):
        # Nothing to fit in this case, so we return self.
        return self

    def transform(self, data):
        # Work on a copy so the caller's DataFrame is not mutated in place
        data = data.copy()
        # Convert the selected columns from comma-decimal strings to float
        data[self.float_columns] = data[self.float_columns].apply(lambda x: x.str.replace(',', '.').astype(float))
        # Convert 'Time' column to pandas datetime format
        data['Time_New'] = pd.to_datetime(data['Time'], format="%d.%m.%Y %H:%M:%S")
        # Fill NaN values with 0
        data = data.fillna(0)
        return data

    def fit_transform(self, data):
        # Fit (a no-op here), then transform
        self.fit(data)
        return self.transform(data)
# Example usage:
# Instantiate the DataCleanerTransformer class
data_cleaner = DataCleanerTransformer()
X_train_loaded = data_cleaner.fit_transform(X_train)
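# Minimal illustration (made-up values) of the comma-to-dot conversion the
# cleaner applies to the float columns:
demo = pd.Series(['1,5', '2,75', '0,0'])
print(demo.str.replace(',', '.').astype(float))  # -> 1.50, 2.75, 0.00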
drop = [
    # Time features are removed: the model otherwise memorises when each activity was done
    'Year', 'Month', 'Weekday', 'Time', 'Time_New', 'Date', 'Hour',
    # With only a few participants the model otherwise learns who did what
    'Name of the volunteer',
    # Health KPIs describing how the Holter machine works; they don't depend on the activity
    'BRAmplitude', 'ECGAmplitude', 'HRConfidence',
    # These measure how the machine works and whether it detects noise, not the activity we model
    'ECGNoise', 'Jumps', 'JumpFlightTime'
    # Stress is manually inputted
    # , 'Controled stress', ' stress', 'Before Controled stress', 'After controlled stress'
]
X_preprocessed = X_train_loaded.drop(columns = drop)
ratios = (X_preprocessed.nunique() / X_preprocessed.count()).sort_values(ascending=True)
# Create a bar plot to visualize the ratios
plt.figure(figsize=(10, 6))
plt.barh(ratios.index, ratios.values)
plt.xlabel('Ratio of Unique Values to Total Count')
plt.ylabel('Columns')
plt.title('Ratio of Unique Values to Total Count for each Column')
plt.tight_layout()
plt.show()
# Calculate the ratio of variance to mean for each column
ratios = (X_preprocessed.var() / X_preprocessed.mean()).sort_values(ascending=True)
# Create a bar plot to visualize the ratios
plt.figure(figsize=(10, 6))
# Color negative values in red, and non-negative values in the default color (blue)
colors = ['red' if value < 0 else 'blue' for value in ratios.values]
plt.barh(ratios.index, ratios.values, color=colors)
plt.xlabel('Ratio of Variance to Mean')
plt.ylabel('Columns')
plt.title('Ratio of Variance to Mean for each Column')
plt.tight_layout()
plt.show()
# Combine the unique-to-total count ratio and the variance-to-mean ratio per column.
# Align the variance ratios to the order of the unique-value ratios so each scatter
# point pairs values from the same column (sorting them independently would mix columns up).
unique_ratios = (X_preprocessed.nunique() / X_preprocessed.count()).sort_values(ascending=True)
variance_ratios = (X_preprocessed.var() / X_preprocessed.mean())[unique_ratios.index]
# Create a scatter plot to combine the two ratios
plt.figure(figsize=(10, 6))
# Color negative variance-to-mean ratios in red, non-negative ones in blue
colors = ['red' if value < 0 else 'blue' for value in variance_ratios.values]
# Plot unique-to-total count ratios on the x-axis and variance-to-mean ratios on the y-axis
plt.scatter(unique_ratios.values, variance_ratios.values, color=colors)
# Annotate each point with the column name next to the data point
for i, col in enumerate(unique_ratios.index):
plt.annotate(col, xy=(unique_ratios.values[i], variance_ratios.values[i]),
xytext=(5, 0), textcoords='offset points')
# Set log scales for both x and y axes
plt.xscale('log')
plt.yscale('log')
plt.xlabel('Unique-to-Total Count Ratio')
plt.ylabel('Variance-to-Mean Ratio')
plt.title('Unique-to-Total Count vs. Variance-to-Mean Ratio for each Column (Log Scales)')
plt.tight_layout()
plt.show()
class AnyGreaterThanZeroTransformer:
    def __init__(self, columns, c_name):
        self.columns = columns
        self.c_name = c_name

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # Collapse the specified columns into a single 0/1 flag that is 1
        # whenever any of them is greater than zero
        X[self.c_name] = (X[self.columns] > 0).any(axis=1).astype(int)
        # Drop the original columns as they are no longer needed
        X.drop(columns=self.columns, inplace=True)
        return X

    def fit_transform(self, X, y=None):
        return self.fit(X).transform(X)
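# Toy demonstration on made-up data: the two source columns collapse into a
# single 0/1 flag that is 1 whenever either value is positive, and the source
# columns are dropped.
demo_df = pd.DataFrame({'a': [0, 2, 0], 'b': [0, 0, 3]})
print(AnyGreaterThanZeroTransformer(columns=['a', 'b'], c_name='any_positive').fit_transform(demo_df))
# any_positive: 0, 1, 1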
X_non_binary = X_preprocessed.copy()
# Plot histograms for the specified columns
X_preprocessed.hist(column=['Controled stress', ' stress', 'Before Controled stress', 'After controlled stress'], bins=10, figsize=(10, 6))
plt.xlabel('Values')
plt.ylabel('Frequency')
# plt.title('Histograms for the Specified Columns')
plt.tight_layout()
plt.show()
# Plot boxplots for the specified columns
plt.figure(figsize=(10, 6))
sns.boxplot(data=X_preprocessed[['Controled stress', ' stress', 'Before Controled stress', 'After controlled stress']])
plt.xlabel('Columns')
plt.ylabel('Values')
# plt.title('Boxplots for the Specified Columns')
plt.tight_layout()
plt.show()
# Collapse the four stress columns into a single binary flag, 'Stress_Binary'
Stress_Data = AnyGreaterThanZeroTransformer(columns=['Controled stress', ' stress', 'Before Controled stress', 'After controlled stress'],
                                            c_name='Stress_Binary')\
                                            .fit_transform(X_non_binary)
# Plot histograms for the specified columns
X_preprocessed.hist(column=['MajorImpacts', 'MinorImpacts'], bins=10, figsize=(10, 6))
plt.xlabel('Values')
plt.ylabel('Frequency')
# plt.title('Histograms for the Specified Columns')
plt.tight_layout()
plt.show()
# Plot boxplots for the specified columns
plt.figure(figsize=(10, 6))
sns.boxplot(data=X_preprocessed[['MajorImpacts','MinorImpacts']])
plt.xlabel('Columns')
plt.ylabel('Values')
# plt.title('Boxplots for the Specified Columns')
plt.tight_layout()
plt.show()
Impact_Data = AnyGreaterThanZeroTransformer(columns=['MajorImpacts','MinorImpacts']
,c_name = 'Impact_Binary')\
.fit_transform(Stress_Data)
# Plot histograms for the specified columns
X_preprocessed.hist(column=['AvForceDevRate', 'AvStepImpulse', 'AvStepPeriod'], bins=10, figsize=(10, 6))
plt.xlabel('Values')
plt.ylabel('Frequency')
# plt.title('Histograms for the Specified Columns')
plt.tight_layout()
plt.show()
# Plot boxplots for the specified columns
plt.figure(figsize=(10, 6))
sns.boxplot(data=X_preprocessed[['AvForceDevRate', 'AvStepImpulse', 'AvStepPeriod']])
plt.xlabel('Columns')
plt.ylabel('Values')
# plt.title('Boxplots for the Specified Columns')
plt.tight_layout()
plt.show()
Av_Data = AnyGreaterThanZeroTransformer(columns=['AvForceDevRate', 'AvStepImpulse', 'AvStepPeriod'],
                                        c_name='Av_Binary')\
                                        .fit_transform(Impact_Data)
# Plot histograms for the specified columns
X_preprocessed.hist(column=['PeakAcceleration', 'PeakAccelPhi', 'peakAccelTheta'], bins=10, figsize=(10, 6))
plt.xlabel('Values')
plt.ylabel('Frequency')
# plt.title('Histograms for the Specified Columns')
plt.tight_layout()
plt.show()
# Plot boxplots for the specified columns
plt.figure(figsize=(10, 6))
sns.boxplot(data=X_preprocessed[['PeakAcceleration', 'PeakAccelPhi', 'peakAccelTheta']])
plt.xlabel('Columns')
plt.ylabel('Values')
# plt.title('Boxplots for the Specified Columns')
plt.tight_layout()
plt.show()
Accel_Data = AnyGreaterThanZeroTransformer(columns=['PeakAcceleration'],
                                           c_name='Acc_Binary')\
                                           .fit_transform(Av_Data)
# Plot histograms for the specified columns
X_preprocessed.hist(column=['Bounds', 'RunSteps'], bins=10, figsize=(10, 6))
plt.xlabel('Values')
plt.ylabel('Frequency')
# plt.title('Histograms for the Specified Columns')
plt.tight_layout()
plt.show()
# Plot boxplots for the specified columns
plt.figure(figsize=(10, 6))
sns.boxplot(data=X_preprocessed[['Bounds', 'RunSteps']])
plt.xlabel('Columns')
plt.ylabel('Values')
# plt.title('Boxplots for the Specified Columns')
plt.tight_layout()
plt.show()
Bounds_Data = AnyGreaterThanZeroTransformer(columns=['Bounds'],
                                            c_name='Bounds_Binary')\
                                            .fit_transform(Accel_Data)
X_train_outlier = Bounds_Data.copy()
# Identify non-binary columns
non_binary_columns = X_train_outlier.columns[X_train_outlier.nunique() > 2]
binary_columns = X_train_outlier.drop(columns=non_binary_columns).columns.tolist()
# Plot boxplots for the specified columns
plt.figure(figsize=(10, 6))
sns.boxplot(data=X_train_outlier[non_binary_columns])
plt.xlabel('Columns')
plt.ylabel('Values')
# plt.title('Boxplots for the Specified Columns')
plt.tight_layout()
plt.show()
class OutlierReplacer:
    def __init__(self, columns):
        self.columns = columns
        self.iqr_ranges = {}

    def fit(self, X, y=None):
        # Learn the Tukey fences (Q1 - 1.5*IQR, Q3 + 1.5*IQR) per column
        for column in self.columns:
            Q1 = X[column].quantile(0.25)
            Q3 = X[column].quantile(0.75)
            IQR = Q3 - Q1
            self.iqr_ranges[column] = (Q1 - 1.5 * IQR, Q3 + 1.5 * IQR)
        return self

    def transform(self, X):
        X_transformed = X.copy()
        for column in self.columns:
            if column in self.iqr_ranges:
                lower, upper = self.iqr_ranges[column]
                outliers = (X_transformed[column] < lower) | (X_transformed[column] > upper)
                # Mean of the non-outlier values, used as the replacement
                clean_data_iqr = X_transformed[~outliers].fillna(0)
                mean_value = clean_data_iqr[column].mean()
                # Add small Gaussian noise so the replacements aren't all identical
                noise = np.random.normal(0, 0.1, outliers.sum())
                X_transformed.loc[outliers, column] = mean_value + noise
        return X_transformed

    def fit_transform(self, X, y=None):
        return self.fit(X).transform(X)
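# Toy demonstration of the IQR rule on made-up numbers: 100.0 lies far above
# Q3 + 1.5*IQR, so it is replaced by the mean of the remaining values (2.0)
# plus small Gaussian noise.
demo_df = pd.DataFrame({'v': [1.0, 2.0, 2.0, 3.0, 100.0]})
print(OutlierReplacer(columns=['v']).fit_transform(demo_df))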
# Instantiate the OutlierReplacer class and fit_transform on training data
outlier_replacer = OutlierReplacer(columns=non_binary_columns).fit(X_train_outlier)
X_train_less_outliers = outlier_replacer.transform(X_train_outlier)
# Make a copy of the original DataFrame
X_train_scaled = X_train_less_outliers.copy()
# Fit the StandardScaler on the non-binary columns only; the binary flags are left unscaled
scaler = StandardScaler().fit(X_train_less_outliers[non_binary_columns])
X_train_scaled[non_binary_columns] = scaler.transform(X_train_less_outliers[non_binary_columns])
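# The two steps above (copy, then scale only the continuous columns) could also
# be written with ColumnTransformer, which is already imported. A sketch under
# the assumption that a NumPy-array result is acceptable; 'selective_scaler' is
# a hypothetical name, equivalent in effect to the manual version:
selective_scaler = ColumnTransformer(
    transformers=[('scale', StandardScaler(), list(non_binary_columns))],
    remainder='passthrough',  # pass the binary flag columns through unscaled
)
X_train_scaled_alt = selective_scaler.fit_transform(X_train_less_outliers)
# Note: the output is an ndarray with the scaled columns first, not a DataFrame.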
# Fit the label encoder on the target column as a 1-D Series (passing the full
# DataFrame raises sklearn's column-vector DataConversionWarning)
label_encoder = LabelEncoder().fit(y_train['Activities Detailed'])
y_train['Target'] = label_encoder.transform(y_train['Activities Detailed'])
y_train_target = y_train[['Target']]
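# Hypothetical inspection step (not in the original notebook): the fitted
# encoder's classes_ array gives the integer-code -> activity-name mapping.
for code, name in enumerate(label_encoder.classes_):
    print(code, '->', name)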
X_test_initial = X_test.copy()
X_test_loaded = data_cleaner.transform(X_test_initial) #data cleaning
X_test_preprocessed = X_test_loaded.drop(columns = drop) #columns drop
X_test_non_binary = X_test_preprocessed.copy() #copy df
# Mirror the training step: collapse the four stress columns into 'Stress_Binary'
X_test_Stress_Data = AnyGreaterThanZeroTransformer(columns=['Controled stress', ' stress', 'Before Controled stress', 'After controlled stress'],
                                                   c_name='Stress_Binary')\
                                                   .transform(X_test_non_binary) #binary stress
X_test_Impact_Data = AnyGreaterThanZeroTransformer(columns=['MajorImpacts','MinorImpacts']
,c_name = 'Impact_Binary')\
.transform(X_test_Stress_Data) #binary impacts
X_test_Av_Data = AnyGreaterThanZeroTransformer(columns=['AvForceDevRate', 'AvStepImpulse', 'AvStepPeriod']
,c_name = 'Av_Binary')\
.transform(X_test_Impact_Data) #binary av columns
X_test_Accel_Data = AnyGreaterThanZeroTransformer(columns=['PeakAcceleration'],
                                                  c_name='Acc_Binary')\
                                                  .transform(X_test_Av_Data) #binary accel
X_test_Bounds_Data = AnyGreaterThanZeroTransformer(columns=['Bounds']
,c_name = 'Bounds_Binary')\
.transform(X_test_Accel_Data) #binary bounds
X_test_outlier = X_test_Bounds_Data.copy() #copying the data before outlier removal
X_test_less_outliers = outlier_replacer.transform(X_test_outlier) # replace outliers using the training-set IQR fences
X_test_scaled = X_test_less_outliers.copy() # copy before scaling
X_test_scaled[non_binary_columns] = scaler.transform(X_test_scaled[non_binary_columns]) # scale with the training-set scaler
y_test['Target'] = label_encoder.transform(y_test['Activities Detailed']) # encode labels with the training-set encoder
y_test_target = y_test[['Target']] # target column
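# Defensive check (not in the original notebook): the manually mirrored test-set
# preprocessing should reproduce the training feature layout exactly.
assert list(X_train_scaled.columns) == list(X_test_scaled.columns), 'train/test feature columns are misaligned'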
X_train_scaled
 | HR | BR | Posture | Activity | HRV | CoreTemp | ImpulseLoad | WalkSteps | RunSteps | PeakAccelPhi | peakAccelTheta | Stress_Binary | Impact_Binary | Av_Binary | Acc_Binary | Bounds_Binary |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
194779 | -0.900290 | 1.463172 | 0.706214 | -0.222442 | 0.000041 | 0.298850 | -0.749086 | -0.590786 | -0.858415 | 0.871294 | -0.381234 | 1 | 0 | 0 | 1 | 0 |
144310 | -1.460079 | 0.086207 | 0.002183 | -0.222442 | 1.715629 | -1.336719 | -0.091310 | -0.412061 | 0.119176 | -1.970502 | 1.179662 | 1 | 1 | 0 | 1 | 0 |
220145 | -0.775892 | 0.636993 | 1.552896 | -0.222442 | 0.417740 | 0.707743 | -0.483764 | -0.371043 | -0.858415 | 0.169616 | -0.542706 | 1 | 0 | 0 | 1 | 0 |
141697 | -0.775892 | 0.086207 | 0.002407 | -0.222442 | -0.717913 | -0.518934 | -0.256031 | -0.423781 | 0.119176 | -1.514411 | -0.865650 | 1 | 1 | 0 | 1 | 0 |
214535 | 0.592482 | 0.912386 | 1.801920 | -0.222442 | -0.839591 | 0.298850 | -0.883957 | -0.728492 | -0.369620 | -0.040887 | -0.506823 | 1 | 0 | 0 | 1 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
49582 | 1.587663 | 0.636993 | -0.289882 | 0.470511 | -0.150087 | 1.525527 | 0.000052 | 0.000328 | -0.037315 | 0.625707 | 1.098926 | 1 | 1 | 0 | 1 | 0 |
173083 | 0.468084 | -1.841545 | -1.186368 | 0.239527 | 0.417740 | 0.298850 | -1.013301 | -0.857408 | -1.347211 | -0.075971 | 1.233486 | 1 | 0 | 0 | 1 | 0 |
15965 | -0.153904 | 0.361600 | 0.905433 | -0.222442 | -0.717913 | -0.110042 | 0.956708 | 1.313657 | 1.585563 | 0.660791 | -0.659325 | 0 | 0 | 0 | 1 | 0 |
43886 | 1.027873 | 1.187779 | 1.552896 | 0.540247 | 2.202338 | 0.707743 | -0.000015 | 0.000591 | -0.048939 | 0.695875 | -1.646098 | 1 | 0 | 0 | 1 | 1 |
131493 | 1.712060 | -0.464580 | 0.307776 | 0.470511 | -0.758472 | -0.067477 | 0.444859 | 0.346786 | 2.074358 | 1.046714 | 0.623480 | 1 | 1 | 0 | 1 | 0 |
182103 rows × 16 columns
def assign_labels_to_clusters(df_kmeans):
    """
    Assign a cluster to each label by grouping the data by label and cluster.
    It counts the occurrences of each label in each cluster, then determines the
    dominant cluster for each label: the cluster in which that label occurs most
    frequently. It then creates a dictionary mapping each label to its dominant
    cluster, and adds an 'assigned_cluster' column where each row contains the
    dominant cluster of its label.

    Parameters:
    df_kmeans (pd.DataFrame): A DataFrame where each row represents a data point, with
        'x' and 'y' the reduced coordinates, 'cluster' the assigned K-Means cluster,
        and 'label' the true label of the data point.

    Returns:
    df_kmeans (pd.DataFrame): The same DataFrame with an additional 'assigned_cluster' column.
    """
    # Group the data by 'label' and 'cluster' and count the occurrences
    label_cluster_counts = df_kmeans.groupby(['label', 'cluster']).size().reset_index(name='count')
    # Find the dominant cluster for each label
    idx = label_cluster_counts.groupby(['label'])['count'].transform('max') == label_cluster_counts['count']
    dominant_clusters = label_cluster_counts[idx]
    # Create a dictionary mapping each label to its dominant cluster
    label_to_cluster = dict(zip(dominant_clusters['label'], dominant_clusters['cluster']))
    # Assign labels to clusters based on the dominant clusters
    df_kmeans['assigned_cluster'] = df_kmeans['label'].map(label_to_cluster)
    return df_kmeans
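# Toy example of the majority-vote mapping on made-up points: label 'a' occurs
# mostly in cluster 0 and label 'b' only in cluster 1, so those become their
# assigned clusters.
demo = pd.DataFrame({'x': [0] * 5, 'y': [0] * 5,
                     'cluster': [0, 0, 1, 1, 1],
                     'label': ['a', 'a', 'a', 'b', 'b']})
print(assign_labels_to_clusters(demo)[['label', 'assigned_cluster']])
# 'a' -> cluster 0 (2 of its 3 points), 'b' -> cluster 1 (2 of 2).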
def perform_tsne_with_kmeans(X_data, y_data, perplexities, n_clusters_range, n_iter=1000, img_name_prefix='t-sne'):
    """
    Perform t-SNE dimensionality reduction followed by K-Means clustering,
    for a range of perplexities and a range of numbers of clusters.
    For each combination of perplexity and number of clusters it creates two scatter
    plots: one of the t-SNE reduced data colored by true label, and one of the
    K-Means clusters. It also assigns a label to each cluster based on the majority
    of points and prints the labels included in each cluster.

    Parameters:
    X_data (np.array): The high-dimensional data to reduce and cluster.
    y_data (np.array): The true labels of the data.
    perplexities (list): A list of perplexities to use for t-SNE.
    n_clusters_range (list): A list of numbers of clusters to use for K-Means.
    n_iter (int, optional): The number of iterations for t-SNE. Default is 1000.
    img_name_prefix (str, optional): The prefix for the saved plot image files. Default is 't-sne'.

    Returns:
    None
    """
    colors = sns.color_palette('tab20', n_colors=20)  # a palette large enough for all labels
    for index, perplexity in enumerate(perplexities):
        # Pass n_iter through to t-SNE (the sklearn <1.5 parameter name; newer versions call it max_iter)
        X_reduced = TSNE(verbose=2, perplexity=perplexity, n_iter=n_iter).fit_transform(X_data)
        df = pd.DataFrame({'x': X_reduced[:, 0], 'y': X_reduced[:, 1], 'label': y_data})
        unique_labels = df['label'].unique()
        # Assign each label its own color
        color_dict = dict(zip(unique_labels, colors[:len(unique_labels)]))
        df['color'] = df['label'].map(color_dict)
        # Plot the t-SNE embedding colored by true label
        plt.figure(figsize=(14, 10))
        sns.scatterplot(data=df, x='x', y='y', hue='label', style='label', palette=color_dict, markers=True)
        plt.title("Perplexity: {} and Max_iter: {}".format(perplexity, n_iter))
        img_name = img_name_prefix + '_perp_{}_iter_{}.png'.format(perplexity, n_iter)
        plt.savefig(img_name)
        plt.show()
        print('All good')
        # Apply K-Means clustering on the t-SNE reduced data
        for n_clusters in n_clusters_range:
            # Explicit n_init silences the sklearn FutureWarning about its changing default
            kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
            clusters = kmeans.fit_predict(X_reduced)
            # Prepare the data for seaborn
            df_kmeans = pd.DataFrame({'x': X_reduced[:, 0], 'y': X_reduced[:, 1], 'cluster': clusters, 'label': y_data})
            # Map each cluster to a color
            color_dict_kmeans = dict(zip(range(n_clusters), colors[:n_clusters]))
            df_kmeans['color'] = df_kmeans['cluster'].map(color_dict_kmeans)
            # Draw the K-Means clustering plot
            plt.figure(figsize=(14, 10))
            sns.scatterplot(data=df_kmeans, x='x', y='y', hue='cluster', palette=color_dict_kmeans, markers=True)
            plt.title("K-Means Clustering with {} clusters".format(n_clusters))
            img_name = img_name_prefix + '_perp_{}_iter_{}_kmeans_{}.png'.format(perplexity, n_iter, n_clusters)
            print('Saving this K-Means clustering plot as an image in the present working directory...')
            plt.savefig(img_name)
            plt.show()
            print('Done')
            # Assign labels to clusters based on the majority of points
            df_kmeans = assign_labels_to_clusters(df_kmeans)
            # Display the labels included in each cluster
            for cluster_num in range(n_clusters):
                cluster_labels = df_kmeans[df_kmeans['assigned_cluster'] == cluster_num]['label'].unique()
                print("Cluster {}: {}".format(cluster_num, ", ".join(cluster_labels)))
X_train_scaled.shape, y_train.shape
((182103, 16), (182103, 2))
perform_tsne_with_kmeans(X_data = X_train_scaled, y_data = y_train['Activities Detailed'].values, perplexities=[2,10,20], n_clusters_range=[4,5, 6, 7,8])
[t-SNE] verbose log for perplexity=2 trimmed: 182103 samples indexed; KL divergence after 250 iterations with early exaggeration: 127.911926; KL divergence after 1000 iterations: 4.043691
All good
Saving this K-Means clustering plot as an image in the present working directory...
Done
Cluster 0: dream, reading book, presentation, meeting with client, creative writing
Cluster 1: office work, training, competition, public speaking, walking meeting
Cluster 2: sport, Play Quitar, phone calls, cognitive workout, negotiation, writing study, Coherent Breathing
Cluster 3: sales, play piano, leisure
Saving this K-Means clustering plot as an image in the present working directory...
Done
Cluster 0: training, public speaking, walking meeting
Cluster 1: dream, play piano
Cluster 2: office work, sales, competition, leisure
Cluster 3: sport, Play Quitar, phone calls, negotiation, writing study
Cluster 4: reading book, presentation, cognitive workout, meeting with client, creative writing, Coherent Breathing
Saving this K-Means clustering plot as an image in the present working directory...
Done
Cluster 0: reading book, presentation, cognitive workout, meeting with client, creative writing
Cluster 1: training, public speaking, walking meeting
Cluster 2: office work, sales, competition
Cluster 3: phone calls, play piano, leisure, Coherent Breathing
Cluster 4: sport, Play Quitar, negotiation, writing study
Cluster 5: dream
Saving this K-Means clustering plot as an image in the present working directory...
Done
Cluster 0: dream, reading book
Cluster 1: play piano, leisure, Coherent Breathing
Cluster 2: sport, phone calls, negotiation
Cluster 3: office work, competition
Cluster 4: training, public speaking, walking meeting
Cluster 5: sales
Cluster 6: Play Quitar, presentation, cognitive workout, meeting with client, creative writing, writing study
Saving this K-Means clustering plot as an image in the present working directory...
Done
Cluster 0: office work, sales
Cluster 1: phone calls, play piano, leisure, Coherent Breathing
Cluster 2: sport, Play Quitar, negotiation
Cluster 3: training, walking meeting
Cluster 4: presentation, cognitive workout, meeting with client, creative writing, writing study
Cluster 5:
Cluster 6: competition, public speaking
Cluster 7: dream, reading book
[t-SNE] verbose log for perplexity=10 trimmed: 182103 samples indexed; KL divergence after 250 iterations with early exaggeration: 107.877365; KL divergence after 1000 iterations: 2.652257
All good
Saving this K-Means clustering plot as an image in the present working directory...
Done
Cluster 0: dream, reading book, meeting with client, Coherent Breathing
Cluster 1: sales, play piano, leisure
Cluster 2: office work, training, competition, public speaking, walking meeting
Cluster 3: sport, Play Quitar, presentation, phone calls, cognitive workout, negotiation, creative writing, writing study
Saving this K-Means clustering plot as an image in the present working directory...
Done
Cluster 0: office work, competition, public speaking
Cluster 1: sport, phone calls, training, negotiation, writing study, walking meeting
Cluster 2: dream, Coherent Breathing
Cluster 3: reading book, Play Quitar, presentation, cognitive workout, meeting with client, creative writing
Cluster 4: sales, play piano, leisure
Saving this K-Means clustering plot as an image in the present working directory...
Done
Cluster 0: office work, competition, public speaking
Cluster 1: leisure, walking meeting, Coherent Breathing
Cluster 2: sales, play piano
Cluster 3: dream
Cluster 4: reading book, Play Quitar, presentation, cognitive workout, meeting with client, creative writing
Cluster 5: sport, phone calls, training, negotiation, writing study
Saving this K-Means clustering plot as an image in the present working directory...
Done
Cluster 0: walking meeting, Coherent Breathing
Cluster 1: training, public speaking
Cluster 2: Play Quitar, presentation, cognitive workout, meeting with client, creative writing
Cluster 3: dream, reading book
Cluster 4: play piano, leisure
Cluster 5: sport, phone calls, negotiation, writing study
Cluster 6: office work, sales, competition
Saving this K-Means clustering plot as an image in the present working directory...
Done
Cluster 0: office work, sales
Cluster 1: dream
Cluster 2: Play Quitar, negotiation, writing study
Cluster 3: sport, walking meeting, Coherent Breathing
Cluster 4: phone calls, training
Cluster 5: play piano, leisure
Cluster 6: competition, public speaking
Cluster 7: reading book, presentation, cognitive workout, meeting with client, creative writing
[t-SNE] Computing 61 nearest neighbors...
[t-SNE] Indexed 182103 samples in 0.009s...
[t-SNE] Computed neighbors for 182103 samples in 33.470s...
[t-SNE] Computed conditional probabilities for sample 182103 / 182103
[t-SNE] Mean sigma: 0.128359
[t-SNE] Computed conditional probabilities in 2.553s
[t-SNE] KL divergence after 250 iterations with early exaggeration: 100.047470
[t-SNE] KL divergence after 1000 iterations: 2.178004
All good
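# The cluster listings above and below come from K-Means runs with k = 4..8 on
# the 2-D t-SNE embedding. A minimal sketch of the loop (variable names are
# ours; n_init is left at its default, which is what triggers the FutureWarning
# shown in the output):
for k in range(4, 9):
    km = KMeans(n_clusters=k)
    cluster_labels = km.fit_predict(X_embedded)  # hypothetical embedding from the t-SNE step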
Saving this K-Means clustering plot as an image in the present working directory...
Done
Cluster 0: office work, training, competition, public speaking, walking meeting
Cluster 1: dream, reading book, presentation, meeting with client, creative writing, Coherent Breathing
Cluster 2: sales, play piano, leisure
Cluster 3: sport, Play Quitar, phone calls, cognitive workout, negotiation, writing study
Saving this K-Means clustering plot as an image in the present working directory...
Done
Cluster 0: sales, play piano, leisure
Cluster 1: sport, Play Quitar, phone calls, cognitive workout, negotiation, writing study
Cluster 2: dream, competition, Coherent Breathing
Cluster 3: office work, training, public speaking, walking meeting
Cluster 4: reading book, presentation, meeting with client, creative writing
Saving this K-Means clustering plot as an image in the present working directory...
Done
Cluster 0: office work, training, public speaking
Cluster 1: reading book, presentation, cognitive workout, meeting with client, creative writing
Cluster 2: sales, play piano, leisure
Cluster 3: Play Quitar, phone calls, negotiation, writing study
Cluster 4: sport, competition, walking meeting, Coherent Breathing
Cluster 5: dream
Saving this K-Means clustering plot as an image in the present working directory...
Done
Cluster 0: reading book, meeting with client
Cluster 1: sport, walking meeting, Coherent Breathing
Cluster 2: Play Quitar, presentation, cognitive workout, negotiation, creative writing
Cluster 3: phone calls, training, writing study
Cluster 4: sales, play piano, leisure
Cluster 5: dream
Cluster 6: office work, competition, public speaking
Saving this K-Means clustering plot as an image in the present working directory...
Done
Cluster 0: public speaking
Cluster 1: sport, competition, walking meeting, Coherent Breathing
Cluster 2: play piano, leisure
Cluster 3: reading book, presentation, meeting with client, creative writing
Cluster 4: Play Quitar, cognitive workout, negotiation, writing study
Cluster 5: office work, sales
Cluster 6: dream
Cluster 7: phone calls, training
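# Manual mapping from each activity to its cluster, taken from the k=6
# K-Means run above (Cluster 5 contains only 'dream').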
cluster_dict = {
"office work": 0, "training": 0, "public speaking": 0,
"reading book": 1, "presentation": 1, "cognitive workout": 1, "meeting with client": 1, "creative writing": 1,
"sales": 2, "play piano": 2, "leisure": 2,
"Play Quitar": 3, "phone calls": 3, "negotiation": 3, "writing study": 3,
"sport": 4, "competition": 4, "walking meeting": 4, "Coherent Breathing": 4,
"dream": 5
}
X_train_scaled['Cluster'] = y_train['Activities Detailed'].map(cluster_dict)
X_train_scaled
  | HR | BR | Posture | Activity | HRV | CoreTemp | ImpulseLoad | WalkSteps | RunSteps | PeakAccelPhi | peakAccelTheta | Stress_Binary | Impact_Binary | Av_Binary | Acc_Binary | Bounds_Binary | Cluster |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
194779 | -0.900290 | 1.463172 | 0.706214 | -0.222442 | 0.000041 | 0.298850 | -0.749086 | -0.590786 | -0.858415 | 0.871294 | -0.381234 | 1 | 0 | 0 | 1 | 0 | 4 |
144310 | -1.460079 | 0.086207 | 0.002183 | -0.222442 | 1.715629 | -1.336719 | -0.091310 | -0.412061 | 0.119176 | -1.970502 | 1.179662 | 1 | 1 | 0 | 1 | 0 | 5 |
220145 | -0.775892 | 0.636993 | 1.552896 | -0.222442 | 0.417740 | 0.707743 | -0.483764 | -0.371043 | -0.858415 | 0.169616 | -0.542706 | 1 | 0 | 0 | 1 | 0 | 4 |
141697 | -0.775892 | 0.086207 | 0.002407 | -0.222442 | -0.717913 | -0.518934 | -0.256031 | -0.423781 | 0.119176 | -1.514411 | -0.865650 | 1 | 1 | 0 | 1 | 0 | 5 |
214535 | 0.592482 | 0.912386 | 1.801920 | -0.222442 | -0.839591 | 0.298850 | -0.883957 | -0.728492 | -0.369620 | -0.040887 | -0.506823 | 1 | 0 | 0 | 1 | 0 | 4 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
49582 | 1.587663 | 0.636993 | -0.289882 | 0.470511 | -0.150087 | 1.525527 | 0.000052 | 0.000328 | -0.037315 | 0.625707 | 1.098926 | 1 | 1 | 0 | 1 | 0 | 4 |
173083 | 0.468084 | -1.841545 | -1.186368 | 0.239527 | 0.417740 | 0.298850 | -1.013301 | -0.857408 | -1.347211 | -0.075971 | 1.233486 | 1 | 0 | 0 | 1 | 0 | 1 |
15965 | -0.153904 | 0.361600 | 0.905433 | -0.222442 | -0.717913 | -0.110042 | 0.956708 | 1.313657 | 1.585563 | 0.660791 | -0.659325 | 0 | 0 | 0 | 1 | 0 | 0 |
43886 | 1.027873 | 1.187779 | 1.552896 | 0.540247 | 2.202338 | 0.707743 | -0.000015 | 0.000591 | -0.048939 | 0.695875 | -1.646098 | 1 | 0 | 0 | 1 | 1 | 4 |
131493 | 1.712060 | -0.464580 | 0.307776 | 0.470511 | -0.758472 | -0.067477 | 0.444859 | 0.346786 | 2.074358 | 1.046714 | 0.623480 | 1 | 1 | 0 | 1 | 0 | 0 |
182103 rows × 17 columns
import scipy.stats as stats
# 'Cluster' is the column of X_train_scaled holding the cluster label mapped above.
# Run a one-way ANOVA for each feature to test whether its mean differs across clusters.
features = X_train_scaled.columns.difference(['Cluster'])
for feature in features:
    clusters = X_train_scaled['Cluster'].unique()
    d_data = [X_train_scaled[X_train_scaled['Cluster'] == cluster][feature] for cluster in clusters]
    f_val, p_val = stats.f_oneway(*d_data)
    print("ANOVA test for feature '{}': F={}, p={}".format(feature, f_val, p_val))
# The p-value from a one-way ANOVA addresses the null hypothesis that the group means are equal: a low p-value (typically < 0.05) lets us reject it, implying the feature's mean differs significantly across the clusters.
# In the output below, every feature except 'Acc_Binary' has a p-value reported as 0.0, so each of them discriminates between the clusters to some extent. 'Acc_Binary' is constant within the compared groups, so its F statistic is undefined (F=nan, p=nan), which is what the ConstantInputWarning is reporting.
# Keep in mind that ANOVA assumes the data is normally distributed and that the variance is the same for all groups (homoscedasticity). If these assumptions are violated the p-values may be unreliable, so it is worth cross-checking with data visualization, post-hoc tests, or a non-parametric alternative such as Kruskal-Wallis (sketched after the ANOVA output below).
C:\Users\admin\AppData\Roaming\Python\Python39\site-packages\scipy\stats\_stats_py.py:3895: ConstantInputWarning: Each of the input arrays is constant; the F statistic is not defined or infinite
  warnings.warn(stats.ConstantInputWarning(msg))
ANOVA test for feature 'Acc_Binary': F=nan, p=nan
ANOVA test for feature 'Activity': F=369.4727090379749, p=0.0
ANOVA test for feature 'Av_Binary': F=2091.356849449859, p=0.0
ANOVA test for feature 'BR': F=2829.2997638577976, p=0.0
ANOVA test for feature 'Bounds_Binary': F=10951.69990363632, p=0.0
ANOVA test for feature 'CoreTemp': F=60297.55377478051, p=0.0
ANOVA test for feature 'HR': F=27672.42909540929, p=0.0
ANOVA test for feature 'HRV': F=11901.972500035265, p=0.0
ANOVA test for feature 'Impact_Binary': F=31276.65756803394, p=0.0
ANOVA test for feature 'ImpulseLoad': F=13571.616906896052, p=0.0
ANOVA test for feature 'PeakAccelPhi': F=125969.34055534113, p=0.0
ANOVA test for feature 'Posture': F=4495.253069368258, p=0.0
ANOVA test for feature 'RunSteps': F=7538.734647087258, p=0.0
ANOVA test for feature 'Stress_Binary': F=85912.86259031772, p=0.0
ANOVA test for feature 'WalkSteps': F=18894.501819866484, p=0.0
ANOVA test for feature 'peakAccelTheta': F=2698.802121783843, p=0.0
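# ANOVA's normality and equal-variance assumptions may not hold for these
# features, so here is a minimal non-parametric cross-check using the
# Kruskal-Wallis H test. This snippet is ours, not part of the original
# notebook; 'HR' is just an example feature.
feature = 'HR'
groups = [g[feature].values for _, g in X_train_scaled.groupby('Cluster')]
h_val, p_val = stats.kruskal(*groups)
print("Kruskal-Wallis for feature '{}': H={}, p={}".format(feature, h_val, p_val))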
import pandas as pd
import scipy.stats as stats
# Same per-feature ANOVA as above, this time collecting the results in a DataFrame.
features = X_train_scaled.columns.difference(['Cluster'])
# Create a DataFrame to store the ANOVA results
anova_results = pd.DataFrame(columns=['Feature', 'F', 'p'])
for feature in features:
    clusters = X_train_scaled['Cluster'].unique()
    d_data = [X_train_scaled[X_train_scaled['Cluster'] == cluster][feature] for cluster in clusters]
    f_val, p_val = stats.f_oneway(*d_data)
    # Append the results to the DataFrame
    row_df = pd.DataFrame({'Feature': [feature], 'F': [f_val], 'p': [p_val]})
    anova_results = pd.concat([anova_results, row_df], ignore_index=True)
# Show the results, sorted by p-value; NaN entries (constant features) are shown as -1
anova_results.sort_values(by='p', ascending=True).fillna(-1)
C:\Users\admin\AppData\Roaming\Python\Python39\site-packages\scipy\stats\_stats_py.py:3895: ConstantInputWarning: Each of the input arrays is constant; the F statistic is not defined or infinite
  warnings.warn(stats.ConstantInputWarning(msg))
  | Feature | F | p |
---|---|---|---|
1 | Activity | 369.472709 | 0.0 |
2 | Av_Binary | 2091.356849 | 0.0 |
3 | BR | 2829.299764 | 0.0 |
4 | Bounds_Binary | 10951.699904 | 0.0 |
5 | CoreTemp | 60297.553775 | 0.0 |
6 | HR | 27672.429095 | 0.0 |
7 | HRV | 11901.972500 | 0.0 |
8 | Impact_Binary | 31276.657568 | 0.0 |
9 | ImpulseLoad | 13571.616907 | 0.0 |
10 | PeakAccelPhi | 125969.340555 | 0.0 |
11 | Posture | 4495.253069 | 0.0 |
12 | RunSteps | 7538.734647 | 0.0 |
13 | Stress_Binary | 85912.862590 | 0.0 |
14 | WalkSteps | 18894.501820 | 0.0 |
15 | peakAccelTheta | 2698.802122 | 0.0 |
0 | Acc_Binary | -1.000000 | -1.0 |
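# 'Acc_Binary' returned F=nan, p=nan because it is constant within every
# compared group (hence the ConstantInputWarning). A small sketch, with
# variable names of our own choosing, for screening out such zero-variance
# features before running ANOVA:
per_cluster_nunique = X_train_scaled.groupby('Cluster')[list(features)].nunique()
constant_feats = per_cluster_nunique.columns[(per_cluster_nunique <= 1).all()].tolist()
print("Constant within all clusters:", constant_feats)  # expected: ['Acc_Binary']
valid_feats = [f for f in features if f not in constant_feats]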