- Experience sharing
- platform-agnostic - focused on understanding the concepts rather than on particular software implementations
Data uncertainty
SELECT id, date, value
FROM my_table
UNPIVOT (value FOR date IN (date1, date2, date3)) AS unpivoted;
my_table:

id | date1 | date2 | date3 |
---|---|---|---|
1 | 2022-01-01 | 2022-02-01 | 2022-03-01 |
2 | 2022-04-01 | 2022-05-01 | 2022-06-01 |
↓

id | date | value |
---|---|---|
1 | date1 | 2022-01-01 |
1 | date2 | 2022-02-01 |
1 | date3 | 2022-03-01 |
2 | date1 | 2022-04-01 |
2 | date2 | 2022-05-01 |
2 | date3 | 2022-06-01 |
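The same unpivot can be sketched in pandas with melt (a minimal sketch, assuming the wide table above is loaded as a DataFrame named my_table):
import pandas as pd
# hypothetical DataFrame mirroring my_table from the SQL example
my_table = pd.DataFrame({'id': [1, 2],
                         'date1': ['2022-01-01', '2022-04-01'],
                         'date2': ['2022-02-01', '2022-05-01'],
                         'date3': ['2022-03-01', '2022-06-01']})
# melt reproduces UNPIVOT: column names go into 'date', cell values into 'value'
unpivoted = my_table.melt(id_vars='id', var_name='date', value_name='value')
print(unpivoted.sort_values(['id', 'date']))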
Feature engineering is the process of selecting and transforming raw data into features that can be used to train machine learning models.
Most common operations
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# detect outliers using the z-score
z_scores = (df - df.mean()) / df.std()
threshold = 3
outliers = np.abs(z_scores) > threshold
detected_out = df[outliers]
# remove outliers from the data
clean_data = df[~outliers]
# replace outliers with the column mean
df[outliers] = np.nan
df = df.fillna(df.mean())
fig, ax = plt.subplots()
# plot histogram of clean data in blue
ax.hist(clean_data, bins=50, color='blue', alpha=0.5, label='Clean data')
# plot histogram of detected outliers in red
ax.hist(detected_out, bins=50, color='red', alpha=0.5, label='Detected outliers')
ax.set_xlabel('Value')
ax.set_ylabel('Frequency')
ax.set_title('Histogram of Data with Outliers Detected by Z-score')
ax.legend()
plt.show()
# detect outliers using IQR
Q1 = df['Value'].quantile(0.25)
Q3 = df['Value'].quantile(0.75)
IQR = Q3 - Q1
outliers = (df['Value']<Q1 - 1.5 * IQR) | (df['Value']>Q3 + 1.5 * IQR)
detected_out=df[outliers]
# remove outliers from the data
clean_data = df[~outliers]
# replace outliers with the mean value plus noise
mean_value = df['Value'].mean()
noise = np.random.normal(0, 0.1, outliers.sum())
mean_value_with_noise = mean_value + noise
# write the noisy mean values back into the outlier rows
df.loc[outliers, 'Value'] = mean_value_with_noise
# plot histogram with 50 bins
plt.hist(df['Value'], bins=50, alpha=0.5)
# plot detected outliers in red
plt.hist(detected_out['Value'], bins=50, color='red', alpha=0.5)
# set plot labels and legend
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.legend(['Clean Data', 'Detected Outliers'])
# show plot
plt.show()
import pandas as pd
import numpy as np
# create sample dataframe
data = {'A': [1, 2, 3, 4, 5],
'B': [1, 1, 1, 1, 1],
'C': [1, 1, 2, 1, 1],
'D': [1, 2, 3, 2, 1],
'E': [1, 1, 1, 1, 1]}
df = pd.DataFrame(data)
# calculate coefficient of variation for each column
cv = df.std() / df.mean()
# define threshold for coefficient of variation
threshold = 0.1
# identify low varying variables
low_var = cv < threshold
# remove low varying variables from the dataframe
clean_data = df.drop(columns=df.columns[low_var])
# print the results
print('Original data:\n', df)
print('Cleaned data:\n', clean_data)
Original data:
   A  B  C  D  E
0  1  1  1  1  1
1  2  1  1  2  1
2  3  1  2  3  1
3  4  1  1  2  1
4  5  1  1  1  1
Cleaned data:
   A  C  D
0  1  1  1
1  2  1  2
2  3  2  3
3  4  1  2
4  5  1  1
import pandas as pd
# create sample dataframe
data = {'Company': ['A', 'B', 'C', 'D'],
'Revenue': [100000, 50000, 75000, 125000],
'COGS': [70000, 25000, 50000, 80000],
'EBITDA': [30000, 25000, 25000, 45000]}
df = pd.DataFrame(data)
# calculate financial ratios
df['Gross Profit'] = df['Revenue'] - df['COGS']
df['Gross Margin'] = df['Gross Profit'] / df['Revenue']
df['EBITDA Margin']= df['EBITDA'] / df['Revenue']
df['Net Income'] = df['EBITDA'] - (df['COGS'] - (df['COGS'] / df['Revenue']) * df['EBITDA'])
df['Net Margin'] = df['Net Income'] / df['Revenue']
df['ROE'] = df['Net Income'] / (df['COGS'] + df['EBITDA'])
df['ROA'] = df['Net Income'] / df['Revenue']
df[['Gross Margin','EBITDA Margin', 'Net Income','Net Margin','ROE', 'ROA']] = df[['Gross Margin','EBITDA Margin', 'Net Income','Net Margin','ROE', 'ROA']].round(2)
# print the results
df
 | Company | Revenue | COGS | EBITDA | Gross Profit | Gross Margin | EBITDA Margin | Net Income | Net Margin | ROE | ROA |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | A | 100000 | 70000 | 30000 | 30000 | 0.30 | 0.30 | -19000.00 | -0.19 | -0.19 | -0.19 |
1 | B | 50000 | 25000 | 25000 | 25000 | 0.50 | 0.50 | 12500.00 | 0.25 | 0.25 | 0.25 |
2 | C | 75000 | 50000 | 25000 | 25000 | 0.33 | 0.33 | -8333.33 | -0.11 | -0.11 | -0.11 |
3 | D | 125000 | 80000 | 45000 | 45000 | 0.36 | 0.36 | -6200.00 | -0.05 | -0.05 | -0.05 |
Normalization
Standardization
Centering
from sklearn.preprocessing import MinMaxScaler, StandardScaler
# apply different scaling techniques
scalers = {'Min-Max': MinMaxScaler(),
           'Standard': StandardScaler()}
fig, axs = plt.subplots(1, len(scalers)*2, figsize=(12, 3))
for i, (scaler_name, scaler) in enumerate(scalers.items()):
# fit and transform the data using the scaler
scaled_data = scaler.fit_transform(df)
scaled_df = pd.DataFrame(scaled_data, columns=df.columns)
# plot the original data
axs[i*2].scatter(df['A'], df['B'])
axs[i*2].set_title('Original data')
# plot the scaled data
axs[i*2+1].scatter(scaled_df['A'], scaled_df['B'])
axs[i*2+1].set_title(scaler_name + ' scaling')
# apply normalization within range
range_scaler = lambda x: (x - x.min()) / (x.max() - x.min())
scaled_data = df.apply(range_scaler)
scaled_df = pd.DataFrame(scaled_data, columns=df.columns)
# plot the results
fig, axs = plt.subplots(1, 2, figsize=(6, 3))
axs[0].scatter(df['A'], df['C'])
axs[0].set_title('Original data')
axs[1].scatter(scaled_df['A'], scaled_df['C'])
axs[1].set_title('Normalization within range')
plt.show()
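Centering, listed above but not shown in the scaling code, simply subtracts the column mean so that each feature has zero mean while keeping its original spread; a minimal sketch, assuming the same df:
# center each column by subtracting its mean (zero mean, unchanged scale)
centered_df = df - df.mean()
print(centered_df.mean().round(6))  # column means are now ~0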
linear (lin) | logarithmic (lgn) | exponential (xpy) | power (pow) |
---|---|---|---|
$$Y=a_0+a_1.X$$ | $$Y=a_0+a_1.\ln{X}$$ $$Z=\ln{X}$$ $$Y=a_0+a_1.Z$$ | $$Y=e^{a_0+a_1.X}$$ $$Q=\ln{Y}$$ $$Q=a_0+a_1.X$$ | $$Y=a_0.X^{a_1}$$ $$Z=\ln{X}$$ $$Q=\ln{Y}$$ $$Q=a_0+a_1.Z$$ |
reciprocal (rex) | reverse reciprocal (rey) | quadratic sum (sqr) | sine (snx) |
---|---|---|---|
$$Y=a_0+\frac{a_1}{X}$$ $$Z=\frac{1}{X}, {X}\neq{0}$$ $$Y=a_0+a_1.Z$$ | $$Y=\frac{1}{a_0+a_1.X}$$ $$Q=\frac{1}{Y}, {Y}\neq{0}$$ $$Q=a_0+a_1.X$$ | $$Y={(a_0+a_1.X)^2}$$ $$Q=\sqrt{Y}$$ $$Q=\vert a_0+a_1.X\vert$$ | $${Y=a_0+a_1.\sin{X}}$$ $$Z=\sin{X}$$ $$Y=a_0+a_1.Z$$ |
# Calculate the natural logarithm of the feature
data['Log_Feature'] = np.log(data['Feature'])
# Calculate the reciprocal of the feature
data['Reciprocal_Feature'] = 1 / data['Feature']
# Calculate the square root of the feature
data['Sqrt_Feature'] = np.sqrt(data['Feature'])
# Calculate the sine of the feature
data['Sin_Feature'] = np.sin(data['Feature'])
# Plot the original and transformed features against the index
fig, ax = plt.subplots()
ax.plot(data.index, data['Feature'], label='Feature')
ax.plot(data.index, data['Log_Feature'], label='Log_Feature')
ax.plot(data.index, data['Reciprocal_Feature'], label='Reciprocal_Feature')
ax.plot(data.index, data['Sqrt_Feature'], label='Sqrt_Feature')
ax.plot(data.index, data['Sin_Feature'], label='Sin_Feature')
ax.legend()
plt.show()
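To illustrate the linearization idea from the table above, a power model $Y=a_0.X^{a_1}$ can be fitted with an ordinary least-squares line on the log-transformed variables; a minimal sketch on synthetic data (all names here are hypothetical):
import numpy as np
# synthetic data roughly following Y = 2 * X**1.5 with multiplicative noise
rng = np.random.default_rng(0)
X = np.linspace(1, 10, 50)
Y = 2 * X**1.5 * np.exp(rng.normal(0, 0.05, size=X.size))
# linearize: ln(Y) = ln(a0) + a1*ln(X), then fit a straight line
a1, ln_a0 = np.polyfit(np.log(X), np.log(Y), 1)
a0 = np.exp(ln_a0)
print(f'a0 = {a0:.2f}, a1 = {a1:.2f}')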
Convert categorical features into numerical features
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
# create sample dataframe
data = {'color': ['red', 'blue', 'green', 'red', 'green', 'blue'],
'size': ['small', 'medium', 'small', 'large', 'medium', 'large']}
df = pd.DataFrame(data)
# one-hot encode the categorical variables
onehot_encoder = OneHotEncoder()
onehot_encoded = onehot_encoder.fit_transform(df[['color', 'size']])
onehot_encoded = onehot_encoded.astype(int) # convert to integers
onehot_df = pd.DataFrame(onehot_encoded.toarray(), columns=onehot_encoder.get_feature_names_out(['color', 'size']))
# concatenate the one-hot encoded data to the original dataframe
df = pd.concat([df, onehot_df], axis=1)
# drop the original categorical variables
df.drop(['color', 'size'], axis=1, inplace=True)
# drop redundant categories (optional)
# df.drop(['color_red', 'size_small'], axis=1, inplace=True)
df
 | color_blue | color_green | color_red | size_large | size_medium | size_small |
---|---|---|---|---|---|---|
0 | 0 | 0 | 1 | 0 | 0 | 1 |
1 | 1 | 0 | 0 | 0 | 1 | 0 |
2 | 0 | 1 | 0 | 0 | 0 | 1 |
3 | 0 | 0 | 1 | 1 | 0 | 0 |
4 | 0 | 1 | 0 | 0 | 1 | 0 |
5 | 1 | 0 | 0 | 1 | 0 | 0 |
# calculate mean price for each color
color_mean = df.groupby('color')['price'].mean()
# create new column with encoded values
df['color_mean'] = df['color'].map(color_mean)
# drop the original categorical variable
df.drop(['color'], axis=1, inplace=True)
print(df)
     size  price  color_mean
0   small     10        17.5
1  medium     20        21.0
2   small     15        16.5
3   large     25        17.5
4  medium     18        16.5
5   large     22        21.0
# calculate total counts and target counts for each category
total_counts = df.groupby('color')['target'].count()
target_counts = df.groupby('color')['target'].sum()
# calculate percentages and WoE values for each category
total_perc = total_counts / total_counts.sum()
target_perc = target_counts / target_counts.sum()
nontarget_perc = (total_counts - target_counts) / (total_counts.sum() - target_counts.sum())
woe_values = np.log(target_perc / nontarget_perc)
# map the WoE values back onto the rows and drop the original categorical variable
df['color_woe'] = df['color'].map(woe_values)
df.drop(['color'], axis=1, inplace=True)
print(df)
     size  target  color_woe
0   small      10   0.002018
1  medium      20   0.020861
2   small      31  -0.013730
3   large      22   0.002018
4  medium      11  -0.013730
5   large       5   0.020861
Convert numerical/categorical features into categorical features
Unsupervised
Supervised
from sklearn.tree import DecisionTreeClassifier
# unsupervised binning using equal-width intervals
num_bins = 3
bin_labels = ['low', 'medium', 'high']
df['age_bins_unsupervised'] = pd.cut(df['age'], num_bins, labels=bin_labels)
# supervised binning using decision tree
tree = DecisionTreeClassifier(max_depth=1)
y = df['income']
X = df[df.columns.difference(['income','age_bins_unsupervised'])]
tree.fit(X, y)
df['age_bins_supervised'] = tree.predict(X)
ct = pd.crosstab(df['age_bins_supervised'], df['age_bins_unsupervised'])
ct
age_bins_supervised \ age_bins_unsupervised | low | medium | high |
---|---|---|---|
28800 | 14 | 50 | 0 |
53039 | 0 | 8 | 28 |
Most common operations
Lagged variables: variables with a time delay compared to the others, i.e. variables shifted in time.
Methods
shift function in pandas
Hankel matrix - a strongly recommended universal method
# create a lagged variable with a time shift of 1 day
df['lagged'] = df['value'].shift(1)
print(df)
   value  lagged
0      1     NaN
1      2     1.0
2      3     2.0
3      4     3.0
4      5     4.0
import numpy as np
# Generate random time series data with 20 observations
data = np.random.rand(20)
# Define the maximum lag we want to include in our lagged features
max_lag = 5
# Create a Hankel matrix with lagged features
hankel_matrix = np.zeros((len(data), max_lag + 1))
for i in range(max_lag + 1):
    hankel_matrix[i:len(data), i] = data[0:len(data) - i]
hankel_matrix = hankel_matrix.round(3)
# Print the Hankel matrix
print(hankel_matrix)
[[0.74  0.    0.    0.    0.    0.   ]
 [0.497 0.74  0.    0.    0.    0.   ]
 [0.586 0.497 0.74  0.    0.    0.   ]
 [0.061 0.586 0.497 0.74  0.    0.   ]
 [0.617 0.061 0.586 0.497 0.74  0.   ]
 [0.657 0.617 0.061 0.586 0.497 0.74 ]
 [0.859 0.657 0.617 0.061 0.586 0.497]
 [0.569 0.859 0.657 0.617 0.061 0.586]
 [0.905 0.569 0.859 0.657 0.617 0.061]
 [0.834 0.905 0.569 0.859 0.657 0.617]
 [0.568 0.834 0.905 0.569 0.859 0.657]
 [0.847 0.568 0.834 0.905 0.569 0.859]
 [0.026 0.847 0.568 0.834 0.905 0.569]
 [0.818 0.026 0.847 0.568 0.834 0.905]
 [0.961 0.818 0.026 0.847 0.568 0.834]
 [0.207 0.961 0.818 0.026 0.847 0.568]
 [0.57  0.207 0.961 0.818 0.026 0.847]
 [0.954 0.57  0.207 0.961 0.818 0.026]
 [0.237 0.954 0.57  0.207 0.961 0.818]
 [0.474 0.237 0.954 0.57  0.207 0.961]]
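scipy also ships a ready-made Hankel constructor, which uses a slightly different but common convention (each row is a sliding window of the series rather than a zero-padded lag column); a sketch, assuming the same data and max_lag as above:
from scipy.linalg import hankel
# rows are consecutive windows of the series: H[i, j] = data[i + j]
H = hankel(data[:max_lag + 1], data[max_lag:])
print(H.shape)  # (max_lag + 1, len(data) - max_lag)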
Sample windows
Method
# Define the window size for the rolling statistics
window_size = 3
# Calculate rolling mean, standard deviation, and maximum
rolling_mean = series.rolling(window_size).mean()
rolling_std = series.rolling(window_size).std()
rolling_max = series.rolling(window_size).max()
# Collect the original series and the rolling statistics in a DataFrame
df = pd.DataFrame({'Original data': series,
                   'Rolling mean': rolling_mean,
                   'Rolling standard deviation': rolling_std,
                   'Rolling maximum': rolling_max})
df
 | Original data | Rolling mean | Rolling standard deviation | Rolling maximum |
---|---|---|---|---|
0 | 0.076313 | NaN | NaN | NaN |
1 | 0.264040 | NaN | NaN | NaN |
2 | 0.675782 | 0.338712 | 0.306631 | 0.675782 |
3 | 0.068876 | 0.336233 | 0.309826 | 0.675782 |
4 | 0.806467 | 0.517042 | 0.393585 | 0.806467 |
5 | 0.705469 | 0.526937 | 0.399894 | 0.806467 |
6 | 0.756620 | 0.756185 | 0.050500 | 0.806467 |
7 | 0.018057 | 0.493382 | 0.412437 | 0.756620 |
8 | 0.089027 | 0.287901 | 0.407471 | 0.756620 |
9 | 0.579511 | 0.228865 | 0.305734 | 0.579511 |
10 | 0.527292 | 0.398610 | 0.269375 | 0.579511 |
11 | 0.970188 | 0.692330 | 0.242044 | 0.970188 |
12 | 0.485930 | 0.661137 | 0.268444 | 0.970188 |
13 | 0.957106 | 0.804408 | 0.275888 | 0.970188 |
14 | 0.128065 | 0.523700 | 0.415809 | 0.957106 |
15 | 0.372937 | 0.486036 | 0.425935 | 0.957106 |
16 | 0.881238 | 0.460746 | 0.384188 | 0.881238 |
17 | 0.867801 | 0.707325 | 0.289667 | 0.881238 |
18 | 0.971188 | 0.906742 | 0.056215 | 0.971188 |
19 | 0.874550 | 0.904513 | 0.057841 | 0.971188 |
Re-scaling
import pandas as pd
import numpy as np
# create a DataFrame with a datetime index
date_rng = pd.date_range(start='1/1/2020', end='1/20/2020', freq='D')
df = pd.DataFrame(date_rng, columns=['date'])
df['data'] = np.random.randint(0,100,size=(len(date_rng)))
# change the frequency to weekly and take the mean of each group
df = df.set_index('date')
weekly_df = df.resample('W').mean()
weekly_df
date | data |
---|---|
2020-01-05 | 59.600000 |
2020-01-12 | 70.857143 |
2020-01-19 | 42.857143 |
2020-01-26 | 95.000000 |
Re-framing
# fill in the missing dates with NaN values
df = df.set_index('date')
df_new = df.asfreq('D')
df_new
date | data |
---|---|
2020-01-01 | 54.0 |
2020-01-02 | 67.0 |
2020-01-03 | 42.0 |
2020-01-04 | NaN |
2020-01-05 | 60.0 |
2020-01-06 | 22.0 |
2020-01-07 | 99.0 |
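The gaps introduced by asfreq can then be filled, for example by linear interpolation; a minimal sketch, assuming df_new from above:
# fill the missing daily values by linear interpolation
df_filled = df_new.interpolate(method='linear')
print(df_filled.head(7))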
Extracting datetime features
# Convert the data to a Pandas Series with a DatetimeIndex
series = pd.Series(data, index=date_range)
# Extract calendar and time-based features from the index and collect them in a DataFrame
df = pd.DataFrame({'Date': series.index,
                   'Data': series.values,
                   'Year': series.index.year,
                   'Month': series.index.month,
                   'Day': series.index.day,
                   'Hour': series.index.hour,
                   'Minute': series.index.minute})
df
 | Date | Data | Year | Month | Day | Hour | Minute |
---|---|---|---|---|---|---|---|
0 | 2022-01-01 00:00:00 | 0.114295 | 2022 | 1 | 1 | 0 | 0 |
1 | 2022-01-01 01:00:00 | 0.499400 | 2022 | 1 | 1 | 1 | 0 |
2 | 2022-01-01 02:00:00 | 0.316746 | 2022 | 1 | 1 | 2 | 0 |
3 | 2022-01-01 03:00:00 | 0.901192 | 2022 | 1 | 1 | 3 | 0 |
4 | 2022-01-01 04:00:00 | 0.531030 | 2022 | 1 | 1 | 4 | 0 |
5 | 2022-01-01 05:00:00 | 0.792617 | 2022 | 1 | 1 | 5 | 0 |
6 | 2022-01-01 06:00:00 | 0.100412 | 2022 | 1 | 1 | 6 | 0 |
7 | 2022-01-01 07:00:00 | 0.187317 | 2022 | 1 | 1 | 7 | 0 |
8 | 2022-01-01 08:00:00 | 0.786790 | 2022 | 1 | 1 | 8 | 0 |
9 | 2022-01-01 09:00:00 | 0.497147 | 2022 | 1 | 1 | 9 | 0 |
10 | 2022-01-01 10:00:00 | 0.138009 | 2022 | 1 | 1 | 10 | 0 |
11 | 2022-01-01 11:00:00 | 0.681217 | 2022 | 1 | 1 | 11 | 0 |
12 | 2022-01-01 12:00:00 | 0.251439 | 2022 | 1 | 1 | 12 | 0 |
13 | 2022-01-01 13:00:00 | 0.518483 | 2022 | 1 | 1 | 13 | 0 |
14 | 2022-01-01 14:00:00 | 0.559895 | 2022 | 1 | 1 | 14 | 0 |
15 | 2022-01-01 15:00:00 | 0.542000 | 2022 | 1 | 1 | 15 | 0 |
16 | 2022-01-01 16:00:00 | 0.265538 | 2022 | 1 | 1 | 16 | 0 |
17 | 2022-01-01 17:00:00 | 0.068318 | 2022 | 1 | 1 | 17 | 0 |
18 | 2022-01-01 18:00:00 | 0.548399 | 2022 | 1 | 1 | 18 | 0 |
19 | 2022-01-01 19:00:00 | 0.909004 | 2022 | 1 | 1 | 19 | 0 |
# Convert the data to a Pandas Series
series = pd.Series(data)
# Calculate the first and third quartiles
q1 = series.quantile(0.25)
q3 = series.quantile(0.75)
# Define the filter based on the interquartile range (IQR)
iqr = q3 - q1
filter = (series >= q1 - 1.5*iqr) & (series <= q3 + 1.5*iqr)
# Filter the data
filtered_data = series[filter]
# Plot the original data and the filtered data
fig, ax = plt.subplots()
ax.plot(series.index, series.values, label='Original data')
ax.plot(filtered_data.index, filtered_data.values, label='Filtered data')
ax.set_xlabel('Index')
ax.set_ylabel('Value')
ax.legend()
plt.show()
Extract seasonality from a time series by decomposing it into its trend, seasonal, and residual components.
Fourier
# Calculate the Fourier coefficients for each harmonic separately
num_harmonics = 3
all_coeffs = np.fft.fft(series)
coeffs = []
for i in range(1, num_harmonics+1):
coeffs.append(np.zeros(len(all_coeffs), dtype=complex))
coeffs[-1][i] = all_coeffs[i]
coeffs[-1][-i] = all_coeffs[-i]
# Reconstruct the signal using the first 3 harmonics
reconstructed_coeffs = np.zeros(len(all_coeffs), dtype=complex)
for i in range(num_harmonics):
reconstructed_coeffs += coeffs[i]
reconstructed_signal = np.fft.ifft(reconstructed_coeffs).real
reconstructed_signal += series.mean()
# Plot the original signal and the reconstructed signals for each harmonic
fig, ax = plt.subplots()
ax.plot(series.index, series.values, label='Original signal')
for i in range(num_harmonics):
smoothed_signal = pd.Series(np.fft.ifft(coeffs[i]).real, index=series.index).rolling(window=5, center=True).mean()
ax.plot(series.index, smoothed_signal.values, label='Harmonic ' + str(i+1))
ax.plot(series.index, reconstructed_signal, label='Reconstructed signal', linewidth=2, linestyle='--')
ax.set_xlabel('Index')
ax.set_ylabel('Value')
ax.legend()
Seasonality analysis
import statsmodels.api as sm
# Perform the decomposition
decomposition = sm.tsa.seasonal_decompose(series, model='additive', period=4)
fig=decomposition.plot();
fig.set_size_inches((8, 3.5));
fig.tight_layout();
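The extracted components can be used directly as features; a minimal sketch, assuming the decomposition computed above:
# collect the trend, seasonal and residual components as candidate features
components = pd.DataFrame({'trend': decomposition.trend,
                           'seasonal': decomposition.seasonal,
                           'residual': decomposition.resid})
print(components.head())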
Selecting the most relevant features for the model, which can improve performance and reduce the risk of overfitting
X,X Correlation
import itertools

def hyperoptimize(corr_matrix, exclude_features):
n_features = corr_matrix.shape[0]
min_high_corr = float('inf')
best_n_reduce = 0
best_exclude = exclude_features.copy()
for n_reduce in range(1, n_features):
for k in range(n_features):
for exclude in itertools.combinations(exclude_features, k):
exclude = list(exclude)
reduce_features = [i for i in range(n_features) if i not in exclude and i != n_reduce]
reduced_corr_matrix = corr_matrix.iloc[reduce_features, reduce_features]
n_high_corr = (reduced_corr_matrix.abs() > 0.5).sum().sum()
if n_high_corr < min_high_corr:
min_high_corr = n_high_corr
best_n_reduce = n_reduce
best_exclude = exclude
return best_n_reduce, best_exclude
# Compute the correlation matrix
corr_matrix = df.corr()
# Store the initial correlation matrix
corr_matrix_old = df.corr()
# Set the threshold for the number of high-correlation values
threshold = 7
# Set the initial list of excluded features
exclude_features = []
# Iterate until the number of high-correlation values is below the threshold
while True:
# Hyperoptimize the number of features to reduce and the excluded features
n_reduce, exclude_features = hyperoptimize(corr_matrix, exclude_features)
# Reduce the specified features
reduce_features = [i for i in range(corr_matrix.shape[0]) if i not in exclude_features and i != n_reduce]
corr_matrix = corr_matrix.iloc[reduce_features, reduce_features]
# Count the number of high-correlation values
n_high_corr = (corr_matrix.abs() > 0.5).sum().sum()
# If the number of high-correlation values is below the threshold, break the loop
if n_high_corr <= threshold:
break
# Create a figure with two axes side by side
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(10, 4))
# Plot the first heatmap on the left axis
sns.heatmap(corr_matrix_old, annot=True, cmap='coolwarm', center=0, square=True, ax=ax1)
ax1.set_title('Old correlation matrix')
# Plot the second heatmap on the right axis
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0, square=True, ax=ax2)
ax2.set_title('Updated correlation matrix')
# Show the plot
plt.show()
X,Y Correlation
# Compute the correlation coefficients between the dependent variable and the independent variables
corr_matrix = df.corr().iloc[-1, :-1]
# Filter out the independent variables with low correlation coefficients
selected_vars = corr_matrix[corr_matrix.abs() >= 0.5].index
# Summarize the correlations and the selection decision for each independent variable
corr_df = pd.DataFrame({'var': corr_matrix.index,
                        'corr': corr_matrix.values,
                        'selected': np.where(corr_matrix.abs() >= 0.5, 'yes', 'no')})
print(corr_df)
       var      corr selected
0   Indep1  0.256760       no
1   Indep2  0.047934       no
2   Indep3  0.922601      yes
3   Indep4  0.147780       no
4   Indep5  0.891848      yes
5   Indep6  0.006403       no
6   Indep7  0.781090      yes
7   Indep8  0.066287       no
8   Indep9  0.013465       no
9  Indep10 -0.076730       no
Multicollinearity: two or more predictor variables in a regression model are highly correlated, meaning they are approximately linearly dependent on each other.
Variance Inflation Factor (VIF)
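The VIF of feature $j$ is obtained by regressing that feature on all the other features; with $R_j^2$ the coefficient of determination of this auxiliary regression:

$$VIF_j=\frac{1}{1-R_j^2}$$

A common rule of thumb is to inspect or drop features with a VIF above roughly 5-10 (a threshold of 5 is used below).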
from statsmodels.stats.outliers_influence import variance_inflation_factor
# Compute the variance inflation factor for each variable
vif = pd.Series([variance_inflation_factor(df.values, i) for i in range(df.shape[1])],
                index=df.columns)
# Select the variables with VIF less than 5
selected_vars = vif[vif < 5].index
# Print the VIF and the selected variables
print('Variance Inflation Factor:')
print(vif)
print('Selected variables:')
print(selected_vars)
Variance Inflation Factor:
Var1     39.607395
Var2     27.014129
Var3     31.176229
Var4      4.314400
Var5     32.245597
Var6      4.181100
Var7     40.196595
Var8      3.633429
Var9      4.434751
Var10     3.954685
dtype: float64
Selected variables:
Index(['Var4', 'Var6', 'Var8', 'Var9', 'Var10'], dtype='object')
Regularization is a technique to prevent overfitting and improve the performance of predictive models by adding a penalty term to the model.
Least Absolute Shrinkage and Selection Operator (Lasso) adds a penalty term to the sum of squared errors (SSE) in the linear regression equation, using a tuning parameter to control the amount of regularization:

$$\min_{\beta}\ \sum_{i=1}^{n}\left(y_i-\beta_0-\sum_{j=1}^{p}\beta_j x_{ij}\right)^2+\lambda\sum_{j=1}^{p}\left|\beta_j\right|$$

where: $\lambda$ is the tuning parameter controlling the strength of the penalty and $\beta_j$ are the model coefficients.
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(dataset.data, dataset.target, test_size=0.2, random_state=42)
# Create a Lasso regression object with alpha=0.5
lasso_reg = Lasso(alpha=0.5)
# Fit the Lasso regression model to the training data
lasso_reg.fit(X_train, y_train)
# Use the trained model to make predictions on the testing data
y_pred = lasso_reg.predict(X_test)
# Calculate the mean squared error of the predictions
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error: ", mse)
df.round(2)
Mean Squared Error: 0.7263312822033787
 | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | icpt |
---|---|---|---|---|---|---|---|---|---|
0 | 0.29 | 0.01 | 0.0 | -0.0 | 0.0 | -0.0 | -0.0 | -0.0 | 0.59 |
plot_lasso_coefs()
plot_lasso_mse()
Ridge regression adds a penalty term proportional to the squared values of the model parameters:

$$\min_{\beta}\ \sum_{i=1}^{n}\left(y_i-\beta_0-\sum_{j=1}^{p}\beta_j x_{ij}\right)^2+\lambda\sum_{j=1}^{p}\beta_j^2$$

where $\lambda$ is the tuning parameter and $\beta_j$ are the model coefficients.
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(dataset.data, dataset.target, test_size=0.2, random_state=42)
# Create a Ridge regression object with alpha=0.5
ridge_reg = Ridge(alpha=0.5)
# Fit the Ridge regression model to the training data
ridge_reg.fit(X_train, y_train)
# Use the trained model to make predictions on the testing data
y_pred = ridge_reg.predict(X_test)
# Calculate the mean squared error of the predictions
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error: ", mse)
df.round(2)
Mean Squared Error: 0.5635171842142302
 | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | icpt |
---|---|---|---|---|---|---|---|---|---|
0 | 0.13 | 0.02 | 0.03 | -0.01 | -0.0 | -0.01 | -0.03 | -0.01 | 2.07 |
plot_ridge_coefs()
plot_ridge_mse()
Elastic net is a regularization technique combining the L1 and L2 penalties used in Lasso and Ridge regression:

$$\min_{\beta}\ \sum_{i=1}^{n}\left(y_i-\beta_0-\sum_{j=1}^{p}\beta_j x_{ij}\right)^2+\lambda\left(\rho\sum_{j=1}^{p}\left|\beta_j\right|+\frac{1-\rho}{2}\sum_{j=1}^{p}\beta_j^2\right)$$

where: $\lambda$ controls the overall regularization strength and $\rho$ (the l1_ratio) controls the mix between the L1 and L2 penalties.
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(dataset.data, dataset.target, test_size=0.2, random_state=42)
# Create an Elastic Net object with alpha=0.5, l1_ratio=0.5
enet = ElasticNet(alpha=0.5, l1_ratio=0.5)
# Fit the Elastic Net model to the training data
enet.fit(X_train, y_train)
# Use the trained model to make predictions on the testing data
y_pred = enet.predict(X_test)
# Calculate the mean squared error of the predictions
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error: ", mse)
df.round(2)
Mean Squared Error: 0.6868730783041608
 | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | icpt |
---|---|---|---|---|---|---|---|---|---|
0 | 0.34 | 0.01 | -0.0 | 0.0 | 0.0 | -0.0 | -0.0 | -0.0 | 0.31 |
plot_enet_coef_alpha()
plot_enet_mse_alpha()
plot_enet_coef_l1ratio()
plot_enet_mse_l1ratio()
Feature extraction: This involves transforming raw data into a new set of features that better capture the underlying patterns in the data. This can include techniques such as principal component analysis (PCA), singular value decomposition (SVD), Linear Discriminant Analysis (LDA), etc.
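As a minimal sketch of feature extraction with PCA (synthetic data and illustrative names; scikit-learn's PCA is used, with the usual assumption that features are standardized first):
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
# synthetic data: 100 samples, 5 correlated features (hypothetical example)
rng = np.random.default_rng(42)
base = rng.normal(size=(100, 2))
X = np.hstack([base, base @ rng.normal(size=(2, 3)) + rng.normal(0, 0.1, size=(100, 3))])
# standardize, then project onto the first two principal components
X_scaled = StandardScaler().fit_transform(X)
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
print('Explained variance ratio:', pca.explained_variance_ratio_.round(3))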