- Experience sharing
- platform-agnostic - focused on understanding the concepts rather than on particular software implementations
Data uncertainty
SELECT id, date, value
FROM my_table
UNPIVOT (value FOR date IN (date1, date2, date3)) AS unpivoted;
my_table:

id | date1 | date2 | date3 |
---|---|---|---|
1 | 2022-01-01 | 2022-02-01 | 2022-03-01 |
2 | 2022-04-01 | 2022-05-01 | 2022-06-01 |
↓

id | date | value |
---|---|---|
1 | date1 | 2022-01-01 |
1 | date2 | 2022-02-01 |
1 | date3 | 2022-03-01 |
2 | date1 | 2022-04-01 |
2 | date2 | 2022-05-01 |
2 | date3 | 2022-06-01 |
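The same unpivot can be sketched in pandas with melt (a minimal sketch, assuming the wide table above is loaded as a DataFrame named my_table):
import pandas as pd
# hypothetical DataFrame mirroring my_table from the SQL example
my_table = pd.DataFrame({'id': [1, 2],
                         'date1': ['2022-01-01', '2022-04-01'],
                         'date2': ['2022-02-01', '2022-05-01'],
                         'date3': ['2022-03-01', '2022-06-01']})
# melt reproduces UNPIVOT: column names go into 'date', cell values into 'value'
unpivoted = my_table.melt(id_vars='id', var_name='date', value_name='value')
print(unpivoted.sort_values(['id', 'date']))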
Feature engineering is the process of selecting and transforming raw data into features that can be used to train machine learning models.
Most common operations
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# detect outliers using the z-score
z_scores = (df - df.mean()) / df.std()
threshold = 3
outliers = np.abs(z_scores) > threshold
detected_out = df[outliers]
# remove outliers from the data
clean_data = df[~outliers]
# replace outliers with the column mean
df[outliers] = np.nan
df = df.fillna(df.mean())
fig, ax = plt.subplots()
# plot histogram of clean data in blue
ax.hist(clean_data, bins=50, color='blue', alpha=0.5, label='Clean data')
# plot histogram of detected outliers in red
ax.hist(detected_out, bins=50, color='red', alpha=0.5, label='Detected outliers')
ax.set_xlabel('Value')
ax.set_ylabel('Frequency')
ax.set_title('Histogram of Data with Outliers Detected by Z-score')
ax.legend()
plt.show()
# detect outliers using IQR
Q1 = df['Value'].quantile(0.25)
Q3 = df['Value'].quantile(0.75)
IQR = Q3 - Q1
outliers = (df['Value']<Q1 - 1.5 * IQR) | (df['Value']>Q3 + 1.5 * IQR)
detected_out=df[outliers]
# remove outliers from the data
clean_data = df[~outliers]
# replace outliers with the mean value plus noise
mean_value = df['Value'].mean()
noise = np.random.normal(0, 0.1, outliers.sum())
mean_value_with_noise = mean_value + noise
# write the noisy mean values back into the outlier rows
df.loc[outliers, 'Value'] = mean_value_with_noise
# plot histogram with 50 bins
plt.hist(df['Value'], bins=50, alpha=0.5)
# plot detected outliers in red
plt.hist(detected_out['Value'], bins=50, color='red', alpha=0.5)
# set plot labels and legend
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.legend(['Clean Data', 'Detected Outliers'])
# show plot
plt.show()
import pandas as pd
import numpy as np
# create sample dataframe
data = {'A': [1, 2, 3, 4, 5],
'B': [1, 1, 1, 1, 1],
'C': [1, 1, 2, 1, 1],
'D': [1, 2, 3, 2, 1],
'E': [1, 1, 1, 1, 1]}
df = pd.DataFrame(data)
# calculate coefficient of variation for each column
cv = df.std() / df.mean()
# define threshold for coefficient of variation
threshold = 0.1
# identify low varying variables
low_var = cv < threshold
# remove low varying variables from the dataframe
clean_data = df.drop(columns=df.columns[low_var])
# print the results
print('Original data:\n', df)
print('Cleaned data:\n', clean_data)
Original data:
   A  B  C  D  E
0  1  1  1  1  1
1  2  1  1  2  1
2  3  1  2  3  1
3  4  1  1  2  1
4  5  1  1  1  1
Cleaned data:
   A  C  D
0  1  1  1
1  2  1  2
2  3  2  3
3  4  1  2
4  5  1  1
import pandas as pd
# create sample dataframe
data = {'Company': ['A', 'B', 'C', 'D'],
'Revenue': [100000, 50000, 75000, 125000],
'COGS': [70000, 25000, 50000, 80000],
'EBITDA': [30000, 25000, 25000, 45000]}
df = pd.DataFrame(data)
# calculate financial ratios
df['Gross Profit'] = df['Revenue'] - df['COGS']
df['Gross Margin'] = df['Gross Profit'] / df['Revenue']
df['EBITDA Margin']= df['EBITDA'] / df['Revenue']
df['Net Income'] = df['EBITDA'] - (df['COGS'] - (df['COGS'] / df['Revenue']) * df['EBITDA'])
df['Net Margin'] = df['Net Income'] / df['Revenue']
df['ROE'] = df['Net Income'] / (df['COGS'] + df['EBITDA'])
df['ROA'] = df['Net Income'] / df['Revenue']
df[['Gross Margin','EBITDA Margin', 'Net Income','Net Margin','ROE', 'ROA']] = df[['Gross Margin','EBITDA Margin', 'Net Income','Net Margin','ROE', 'ROA']].round(2)
# print the results
df
 | Company | Revenue | COGS | EBITDA | Gross Profit | Gross Margin | EBITDA Margin | Net Income | Net Margin | ROE | ROA |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | A | 100000 | 70000 | 30000 | 30000 | 0.30 | 0.30 | -19000.00 | -0.19 | -0.19 | -0.19 |
1 | B | 50000 | 25000 | 25000 | 25000 | 0.50 | 0.50 | 12500.00 | 0.25 | 0.25 | 0.25 |
2 | C | 75000 | 50000 | 25000 | 25000 | 0.33 | 0.33 | -8333.33 | -0.11 | -0.11 | -0.11 |
3 | D | 125000 | 80000 | 45000 | 45000 | 0.36 | 0.36 | -6200.00 | -0.05 | -0.05 | -0.05 |
Normalization
Standardization
Centering
from sklearn.preprocessing import MinMaxScaler, StandardScaler
# apply different scaling techniques
scalers = {'Min-Max': MinMaxScaler(),
           'Standard': StandardScaler()}
fig, axs = plt.subplots(1, len(scalers)*2, figsize=(12, 3))
for i, (scaler_name, scaler) in enumerate(scalers.items()):
# fit and transform the data using the scaler
scaled_data = scaler.fit_transform(df)
scaled_df = pd.DataFrame(scaled_data, columns=df.columns)
# plot the original data
axs[i*2].scatter(df['A'], df['B'])
axs[i*2].set_title('Original data')
# plot the scaled data
axs[i*2+1].scatter(scaled_df['A'], scaled_df['B'])
axs[i*2+1].set_title(scaler_name + ' scaling')
# apply normalization within range
range_scaler = lambda x: (x - x.min()) / (x.max() - x.min())
scaled_data = df.apply(range_scaler)
scaled_df = pd.DataFrame(scaled_data, columns=df.columns)
# plot the results
fig, axs = plt.subplots(1, 2, figsize=(6, 3))
axs[0].scatter(df['A'], df['C'])
axs[0].set_title('Original data')
axs[1].scatter(scaled_df['A'], scaled_df['C'])
axs[1].set_title('Normalization within range')
plt.show()
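Centering, listed above but not shown in the scaling code, simply subtracts the column mean so that each feature has zero mean while keeping its original spread; a minimal sketch, assuming the same df:
# center each column by subtracting its mean (zero mean, unchanged scale)
centered_df = df - df.mean()
print(centered_df.mean().round(6))  # column means are now ~0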
linear (lin) | logarithmic (lgn) | exponential (xpy) | power (pow) |
---|---|---|---|
$$Y=a_0+a_1.X$$ | $$Y=a_0+a_1.\ln{X}$$ $$Z=\ln{X}$$ $$Y=a_0+a_1.Z$$ | $$Y=e^{a_0+a_1.X}$$ $$Q=\ln{Y}$$ $$Q=a_0+a_1.X$$ | $$Y=a_0.X^{a_1}$$ $$Z=\ln{X}$$ $$Q=\ln{Y}$$ $$Q=a_0+a_1.Z$$ |
reciprocal (rex) | reverse reciprocal (rey) | quadratic sum (sqr) | sine (snx) |
---|---|---|---|
$$Y=a_0+\frac{a_1}{X}$$ $$Z=\frac{1}{X}, {X}\neq{0}$$ $$Y=a_0+a_1.Z$$ | $$Y=\frac{1}{a_0+a_1.X}$$ $$Q=\frac{1}{Y}, {Y}\neq{0}$$ $$Q=a_0+a_1.X$$ | $$Y={(a_0+a_1.X)^2}$$ $$Q=\sqrt{Y}$$ $$Q=\vert a_0+a_1.X\vert$$ | $${Y=a_0+a_1.\sin{X}}$$ $$Z=\sin{X}$$ $$Y=a_0+a_1.Z$$ |
# Calculate the natural logarithm of the feature
data['Log_Feature'] = np.log(data['Feature'])
# Calculate the reciprocal of the feature
data['Reciprocal_Feature'] = 1 / data['Feature']
# Calculate the square root of the feature
data['Sqrt_Feature'] = np.sqrt(data['Feature'])
# Calculate the sine of the feature
data['Sin_Feature'] = np.sin(data['Feature'])
# Plot the original and transformed features against the index
fig, ax = plt.subplots()
ax.plot(data.index, data['Feature'], label='Feature')
ax.plot(data.index, data['Log_Feature'], label='Log_Feature')
ax.plot(data.index, data['Reciprocal_Feature'], label='Reciprocal_Feature')
ax.plot(data.index, data['Sqrt_Feature'], label='Sqrt_Feature')
ax.plot(data.index, data['Sin_Feature'], label='Sin_Feature')
ax.legend()
plt.show()
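To illustrate the linearization idea from the table above, a power model $Y=a_0.X^{a_1}$ can be fitted with an ordinary least-squares line on the log-transformed variables; a minimal sketch on synthetic data (all names here are hypothetical):
import numpy as np
# synthetic data roughly following Y = 2 * X**1.5 with multiplicative noise
rng = np.random.default_rng(0)
X = np.linspace(1, 10, 50)
Y = 2 * X**1.5 * np.exp(rng.normal(0, 0.05, size=X.size))
# linearize: ln(Y) = ln(a0) + a1*ln(X), then fit a straight line
a1, ln_a0 = np.polyfit(np.log(X), np.log(Y), 1)
a0 = np.exp(ln_a0)
print(f'a0 = {a0:.2f}, a1 = {a1:.2f}')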
Convert categorical features into numerical features
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
# create sample dataframe
data = {'color': ['red', 'blue', 'green', 'red', 'green', 'blue'],
'size': ['small', 'medium', 'small', 'large', 'medium', 'large']}
df = pd.DataFrame(data)
# one-hot encode the categorical variables
onehot_encoder = OneHotEncoder()
onehot_encoded = onehot_encoder.fit_transform(df[['color', 'size']])
onehot_encoded = onehot_encoded.astype(int) # convert to integers
onehot_df = pd.DataFrame(onehot_encoded.toarray(), columns=onehot_encoder.get_feature_names_out(['color', 'size']))
# concatenate the one-hot encoded data to the original dataframe
df = pd.concat([df, onehot_df], axis=1)
# drop the original categorical variables
df.drop(['color', 'size'], axis=1, inplace=True)
# drop redundant categories (optional)
# df.drop(['color_red', 'size_small'], axis=1, inplace=True)
df
 | color_blue | color_green | color_red | size_large | size_medium | size_small |
---|---|---|---|---|---|---|
0 | 0 | 0 | 1 | 0 | 0 | 1 |
1 | 1 | 0 | 0 | 0 | 1 | 0 |
2 | 0 | 1 | 0 | 0 | 0 | 1 |
3 | 0 | 0 | 1 | 1 | 0 | 0 |
4 | 0 | 1 | 0 | 0 | 1 | 0 |
5 | 1 | 0 | 0 | 1 | 0 | 0 |
# calculate mean price for each color
color_mean = df.groupby('color')['price'].mean()
# create new column with encoded values
df['color_mean'] = df['color'].map(color_mean)
# drop the original categorical variable
df.drop(['color'], axis=1, inplace=True)
print(df)
     size  price  color_mean
0   small     10        17.5
1  medium     20        21.0
2   small     15        16.5
3   large     25        17.5
4  medium     18        16.5
5   large     22        21.0
# calculate total counts and target counts for each category
total_counts = df.groupby('color')['target'].count()
target_counts = df.groupby('color')['target'].sum()
# calculate percentages and WoE values for each category
total_perc = total_counts / total_counts.sum()
target_perc = target_counts / target_counts.sum()
nontarget_perc = (total_counts - target_counts) / (total_counts.sum() - target_counts.sum())
woe_values = np.log(target_perc / nontarget_perc)
# map the WoE values back onto the rows and drop the original categorical variable
df['color_woe'] = df['color'].map(woe_values)
df.drop(['color'], axis=1, inplace=True)
print(df)
     size  target  color_woe
0   small      10   0.002018
1  medium      20   0.020861
2   small      31  -0.013730
3   large      22   0.002018
4  medium      11  -0.013730
5   large       5   0.020861
Convert numerical/categorical features into categorical features
Unsupervised
Supervised
from sklearn.tree import DecisionTreeClassifier
# unsupervised binning using equal-width intervals
num_bins = 3
bin_labels = ['low', 'medium', 'high']
df['age_bins_unsupervised'] = pd.cut(df['age'], num_bins, labels=bin_labels)
# supervised binning using decision tree
tree = DecisionTreeClassifier(max_depth=1)
y = df['income']
X = df[df.columns.difference(['income','age_bins_unsupervised'])]
tree.fit(X, y)
df['age_bins_supervised'] = tree.predict(X)
ct = pd.crosstab(df['age_bins_supervised'], df['age_bins_unsupervised'])
ct
age_bins_supervised \ age_bins_unsupervised | low | medium | high |
---|---|---|---|
28800 | 14 | 50 | 0 |
53039 | 0 | 8 | 28 |
Most common operations
Lagged variables: variables with a time delay compared to the others, i.e. variables shifted in time.
Methods
shift function in pandas
Hankel matrix - a strongly recommended universal method
# create a lagged variable with a time shift of 1 day
df['lagged'] = df['value'].shift(1)
print(df)
   value  lagged
0      1     NaN
1      2     1.0
2      3     2.0
3      4     3.0
4      5     4.0
import numpy as np
# Generate random time series data with 20 observations
data = np.random.rand(20)
# Define the maximum lag we want to include in our lagged features
max_lag = 5
# Create a Hankel matrix with lagged features
hankel_matrix = np.zeros((len(data), max_lag + 1))
for i in range(max_lag + 1):
    hankel_matrix[i:len(data), i] = data[0:len(data) - i]
hankel_matrix = hankel_matrix.round(3)
# Print the Hankel matrix
print(hankel_matrix)
[[0.74  0.    0.    0.    0.    0.   ]
 [0.497 0.74  0.    0.    0.    0.   ]
 [0.586 0.497 0.74  0.    0.    0.   ]
 [0.061 0.586 0.497 0.74  0.    0.   ]
 [0.617 0.061 0.586 0.497 0.74  0.   ]
 [0.657 0.617 0.061 0.586 0.497 0.74 ]
 [0.859 0.657 0.617 0.061 0.586 0.497]
 [0.569 0.859 0.657 0.617 0.061 0.586]
 [0.905 0.569 0.859 0.657 0.617 0.061]
 [0.834 0.905 0.569 0.859 0.657 0.617]
 [0.568 0.834 0.905 0.569 0.859 0.657]
 [0.847 0.568 0.834 0.905 0.569 0.859]
 [0.026 0.847 0.568 0.834 0.905 0.569]
 [0.818 0.026 0.847 0.568 0.834 0.905]
 [0.961 0.818 0.026 0.847 0.568 0.834]
 [0.207 0.961 0.818 0.026 0.847 0.568]
 [0.57  0.207 0.961 0.818 0.026 0.847]
 [0.954 0.57  0.207 0.961 0.818 0.026]
 [0.237 0.954 0.57  0.207 0.961 0.818]
 [0.474 0.237 0.954 0.57  0.207 0.961]]
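scipy also ships a ready-made Hankel constructor, which uses a slightly different but common convention (each row is a sliding window of the series rather than a zero-padded lag column); a sketch, assuming the same data and max_lag as above:
from scipy.linalg import hankel
# rows are consecutive windows of the series: H[i, j] = data[i + j]
H = hankel(data[:max_lag + 1], data[max_lag:])
print(H.shape)  # (max_lag + 1, len(data) - max_lag)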
Sample windows
Method
# Define the window size for the rolling statistics
window_size = 3
# Calculate rolling mean, standard deviation, and maximum
rolling_mean = series.rolling(window_size).mean()
rolling_std = series.rolling(window_size).std()
rolling_max = series.rolling(window_size).max()
# Collect the original series and the rolling statistics in a DataFrame
df = pd.DataFrame({'Original data': series,
                   'Rolling mean': rolling_mean,
                   'Rolling standard deviation': rolling_std,
                   'Rolling maximum': rolling_max})
df
 | Original data | Rolling mean | Rolling standard deviation | Rolling maximum |
---|---|---|---|---|
0 | 0.076313 | NaN | NaN | NaN |
1 | 0.264040 | NaN | NaN | NaN |
2 | 0.675782 | 0.338712 | 0.306631 | 0.675782 |
3 | 0.068876 | 0.336233 | 0.309826 | 0.675782 |
4 | 0.806467 | 0.517042 | 0.393585 | 0.806467 |
5 | 0.705469 | 0.526937 | 0.399894 | 0.806467 |
6 | 0.756620 | 0.756185 | 0.050500 | 0.806467 |
7 | 0.018057 | 0.493382 | 0.412437 | 0.756620 |
8 | 0.089027 | 0.287901 | 0.407471 | 0.756620 |
9 | 0.579511 | 0.228865 | 0.305734 | 0.579511 |
10 | 0.527292 | 0.398610 | 0.269375 | 0.579511 |
11 | 0.970188 | 0.692330 | 0.242044 | 0.970188 |
12 | 0.485930 | 0.661137 | 0.268444 | 0.970188 |
13 | 0.957106 | 0.804408 | 0.275888 | 0.970188 |
14 | 0.128065 | 0.523700 | 0.415809 | 0.957106 |
15 | 0.372937 | 0.486036 | 0.425935 | 0.957106 |
16 | 0.881238 | 0.460746 | 0.384188 | 0.881238 |
17 | 0.867801 | 0.707325 | 0.289667 | 0.881238 |
18 | 0.971188 | 0.906742 | 0.056215 | 0.971188 |
19 | 0.874550 | 0.904513 | 0.057841 | 0.971188 |
Re-scaling
import pandas as pd
import numpy as np
# create a DataFrame with a datetime index
date_rng = pd.date_range(start='1/1/2020', end='1/20/2020', freq='D')
df = pd.DataFrame(date_rng, columns=['date'])
df['data'] = np.random.randint(0,100,size=(len(date_rng)))
# change the frequency to weekly and take the mean of each group
df = df.set_index('date')
weekly_df = df.resample('W').mean()
weekly_df
date | data |
---|---|
2020-01-05 | 59.600000 |
2020-01-12 | 70.857143 |
2020-01-19 | 42.857143 |
2020-01-26 | 95.000000 |
Re-framing
# fill in the missing dates with NaN values
df = df.set_index('date')
df_new = df.asfreq('D')
df_new
date | data |
---|---|
2020-01-01 | 54.0 |
2020-01-02 | 67.0 |
2020-01-03 | 42.0 |
2020-01-04 | NaN |
2020-01-05 | 60.0 |
2020-01-06 | 22.0 |
2020-01-07 | 99.0 |
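The gaps introduced by asfreq can then be filled, for example by linear interpolation; a minimal sketch, assuming df_new from above:
# fill the missing daily values by linear interpolation
df_filled = df_new.interpolate(method='linear')
print(df_filled.head(7))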
Extracting datetime features
# Convert the data to a Pandas Series with a DatetimeIndex
series = pd.Series(data, index=date_range)
# Extract calendar and time-based features from the index and collect them in a DataFrame
df = pd.DataFrame({'Date': series.index,
                   'Data': series.values,
                   'Year': series.index.year,
                   'Month': series.index.month,
                   'Day': series.index.day,
                   'Hour': series.index.hour,
                   'Minute': series.index.minute})
df
 | Date | Data | Year | Month | Day | Hour | Minute |
---|---|---|---|---|---|---|---|
0 | 2022-01-01 00:00:00 | 0.114295 | 2022 | 1 | 1 | 0 | 0 |
1 | 2022-01-01 01:00:00 | 0.499400 | 2022 | 1 | 1 | 1 | 0 |
2 | 2022-01-01 02:00:00 | 0.316746 | 2022 | 1 | 1 | 2 | 0 |
3 | 2022-01-01 03:00:00 | 0.901192 | 2022 | 1 | 1 | 3 | 0 |
4 | 2022-01-01 04:00:00 | 0.531030 | 2022 | 1 | 1 | 4 | 0 |
5 | 2022-01-01 05:00:00 | 0.792617 | 2022 | 1 | 1 | 5 | 0 |
6 | 2022-01-01 06:00:00 | 0.100412 | 2022 | 1 | 1 | 6 | 0 |
7 | 2022-01-01 07:00:00 | 0.187317 | 2022 | 1 | 1 | 7 | 0 |
8 | 2022-01-01 08:00:00 | 0.786790 | 2022 | 1 | 1 | 8 | 0 |
9 | 2022-01-01 09:00:00 | 0.497147 | 2022 | 1 | 1 | 9 | 0 |
10 | 2022-01-01 10:00:00 | 0.138009 | 2022 | 1 | 1 | 10 | 0 |
11 | 2022-01-01 11:00:00 | 0.681217 | 2022 | 1 | 1 | 11 | 0 |
12 | 2022-01-01 12:00:00 | 0.251439 | 2022 | 1 | 1 | 12 | 0 |
13 | 2022-01-01 13:00:00 | 0.518483 | 2022 | 1 | 1 | 13 | 0 |
14 | 2022-01-01 14:00:00 | 0.559895 | 2022 | 1 | 1 | 14 | 0 |
15 | 2022-01-01 15:00:00 | 0.542000 | 2022 | 1 | 1 | 15 | 0 |
16 | 2022-01-01 16:00:00 | 0.265538 | 2022 | 1 | 1 | 16 | 0 |
17 | 2022-01-01 17:00:00 | 0.068318 | 2022 | 1 | 1 | 17 | 0 |
18 | 2022-01-01 18:00:00 | 0.548399 | 2022 | 1 | 1 | 18 | 0 |
19 | 2022-01-01 19:00:00 | 0.909004 | 2022 | 1 | 1 | 19 | 0 |
# Convert the data to a Pandas Series
series = pd.Series(data)
# Calculate the first and third quartiles
q1 = series.quantile(0.25)
q3 = series.quantile(0.75)
# Define the filter based on the interquartile range (IQR)
iqr = q3 - q1
filter = (series >= q1 - 1.5*iqr) & (series <= q3 + 1.5*iqr)
# Filter the data
filtered_data = series[filter]
# Plot the original data and the filtered data
fig, ax = plt.subplots()
ax.plot(series.index, series.values, label='Original data')
ax.plot(filtered_data.index, filtered_data.values, label='Filtered data')
ax.set_xlabel('Index')
ax.set_ylabel('Value')
ax.legend()
plt.show()
Extract seasonality from a time series by decomposing it into its trend, seasonal, and residual components.
Fourier
# Calculate the Fourier coefficients for each harmonic separately
num_harmonics = 3
all_coeffs = np.fft.fft(series)
coeffs = []
for i in range(1, num_harmonics+1):
coeffs.append(np.zeros(len(all_coeffs), dtype=complex))
coeffs[-1][i] = all_coeffs[i]
coeffs[-1][-i] = all_coeffs[-i]
# Reconstruct the signal using the first 3 harmonics
reconstructed_coeffs = np.zeros(len(all_coeffs), dtype=complex)
for i in range(num_harmonics):
reconstructed_coeffs += coeffs[i]
reconstructed_signal = np.fft.ifft(reconstructed_coeffs).real
reconstructed_signal += series.mean()
# Plot the original signal and the reconstructed signals for each harmonic
fig, ax = plt.subplots()
ax.plot(series.index, series.values, label='Original signal')
for i in range(num_harmonics):
smoothed_signal = pd.Series(np.fft.ifft(coeffs[i]).real, index=series.index).rolling(window=5, center=True).mean()
ax.plot(series.index, smoothed_signal.values, label='Harmonic ' + str(i+1))
ax.plot(series.index, reconstructed_signal, label='Reconstructed signal', linewidth=2, linestyle='--')
ax.set_xlabel('Index')
ax.set_ylabel('Value')
ax.legend()
Seasonality analysis
import statsmodels.api as sm
# Perform the decomposition
decomposition = sm.tsa.seasonal_decompose(series, model='additive', period=4)
fig=decomposition.plot();
fig.set_size_inches((8, 3.5));
fig.tight_layout();
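The extracted components can be used directly as features; a minimal sketch, assuming the decomposition computed above:
# collect the trend, seasonal and residual components as candidate features
components = pd.DataFrame({'trend': decomposition.trend,
                           'seasonal': decomposition.seasonal,
                           'residual': decomposition.resid})
print(components.head())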
Selecting the most relevant features for the model, which can improve performance and reduce the risk of overfitting
X,X Correlation
import itertools

def hyperoptimize(corr_matrix, exclude_features):
n_features = corr_matrix.shape[0]
min_high_corr = float('inf')
best_n_reduce = 0
best_exclude = exclude_features.copy()
for n_reduce in range(1, n_features):
for k in range(n_features):
for exclude in itertools.combinations(exclude_features, k):
exclude = list(exclude)
reduce_features = [i for i in range(n_features) if i not in exclude and i != n_reduce]
reduced_corr_matrix = corr_matrix.iloc[reduce_features, reduce_features]
n_high_corr = (reduced_corr_matrix.abs() > 0.5).sum().sum()
if n_high_corr < min_high_corr:
min_high_corr = n_high_corr
best_n_reduce = n_reduce
best_exclude = exclude
return best_n_reduce, best_exclude
# Compute the correlation matrix
corr_matrix = df.corr()
# Store the initial correlation matrix
corr_matrix_old = df.corr()
# Set the threshold for the number of high-correlation values
threshold = 7
# Set the initial list of excluded features
exclude_features = []
# Iterate until the number of high-correlation values is below the threshold
while True:
# Hyperoptimize the number of features to reduce and the excluded features
n_reduce, exclude_features = hyperoptimize(corr_matrix, exclude_features)
# Reduce the specified features
reduce_features = [i for i in range(corr_matrix.shape[0]) if i not in exclude_features and i != n_reduce]
corr_matrix = corr_matrix.iloc[reduce_features, reduce_features]
# Count the number of high-correlation values
n_high_corr = (corr_matrix.abs() > 0.5).sum().sum()
# If the number of high-correlation values is below the threshold, break the loop
if n_high_corr <= threshold:
break
# Create a figure with two axes side by side
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(10, 4))
# Plot the first heatmap on the left axis
sns.heatmap(corr_matrix_old, annot=True, cmap='coolwarm', center=0, square=True, ax=ax1)
ax1.set_title('Old correlation matrix')
# Plot the second heatmap on the right axis
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0, square=True, ax=ax2)
ax2.set_title('Updated correlation matrix')
# Show the plot
plt.show()
X,Y Correlation
# Compute the correlation coefficients between the dependent variable and the independent variables
corr_matrix = df.corr().iloc[-1, :-1]
# Filter out the independent variables with low correlation coefficients
selected_vars = corr_matrix[corr_matrix.abs() >= 0.5].index
# Summarize the correlations and the selection decision for each independent variable
corr_df = pd.DataFrame({'var': corr_matrix.index,
                        'corr': corr_matrix.values,
                        'selected': np.where(corr_matrix.abs() >= 0.5, 'yes', 'no')})
print(corr_df)
       var      corr selected
0   Indep1  0.256760       no
1   Indep2  0.047934       no
2   Indep3  0.922601      yes
3   Indep4  0.147780       no
4   Indep5  0.891848      yes
5   Indep6  0.006403       no
6   Indep7  0.781090      yes
7   Indep8  0.066287       no
8   Indep9  0.013465       no
9  Indep10 -0.076730       no
Multicollinearity: two or more predictor variables in a regression model are highly correlated, meaning they are approximately linearly dependent on each other.
Variance Inflation Factor (VIF)
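The VIF of feature $j$ is obtained by regressing that feature on all the other features; with $R_j^2$ the coefficient of determination of this auxiliary regression:

$$VIF_j=\frac{1}{1-R_j^2}$$

A common rule of thumb is to inspect or drop features with a VIF above roughly 5-10 (a threshold of 5 is used below).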
from statsmodels.stats.outliers_influence import variance_inflation_factor
# Compute the variance inflation factor for each variable
vif = pd.Series([variance_inflation_factor(df.values, i) for i in range(df.shape[1])],
                index=df.columns)
# Select the variables with VIF less than 5
selected_vars = vif[vif < 5].index
# Print the VIF and the selected variables
print('Variance Inflation Factor:')
print(vif)
print('Selected variables:')
print(selected_vars)
Variance Inflation Factor:
Var1     39.607395
Var2     27.014129
Var3     31.176229
Var4      4.314400
Var5     32.245597
Var6      4.181100
Var7     40.196595
Var8      3.633429
Var9      4.434751
Var10     3.954685
dtype: float64
Selected variables:
Index(['Var4', 'Var6', 'Var8', 'Var9', 'Var10'], dtype='object')
Regularization is a technique to prevent overfitting and improve the performance of predictive models by adding a penalty term to the model.
Least Absolute Shrinkage and Selection Operator (Lasso) adds a penalty term to the sum of squared errors (SSE) in the linear regression equation, using a tuning parameter to control the amount of regularization:

$$\min_{\beta}\ \sum_{i=1}^{n}\left(y_i-\beta_0-\sum_{j=1}^{p}\beta_j x_{ij}\right)^2+\lambda\sum_{j=1}^{p}\left|\beta_j\right|$$

where: $\lambda$ is the tuning parameter controlling the strength of the penalty and $\beta_j$ are the model coefficients.
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(dataset.data, dataset.target, test_size=0.2, random_state=42)
# Create a Lasso regression object with alpha=0.5
lasso_reg = Lasso(alpha=0.5)
# Fit the Lasso regression model to the training data
lasso_reg.fit(X_train, y_train)
# Use the trained model to make predictions on the testing data
y_pred = lasso_reg.predict(X_test)
# Calculate the mean squared error of the predictions
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error: ", mse)
df.round(2)
Mean Squared Error: 0.7263312822033787
 | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | icpt |
---|---|---|---|---|---|---|---|---|---|
0 | 0.29 | 0.01 | 0.0 | -0.0 | 0.0 | -0.0 | -0.0 | -0.0 | 0.59 |
plot_lasso_coefs()
plot_lasso_mse()
Ridge regression adds a penalty term proportional to the squared values of the model parameters:

$$\min_{\beta}\ \sum_{i=1}^{n}\left(y_i-\beta_0-\sum_{j=1}^{p}\beta_j x_{ij}\right)^2+\lambda\sum_{j=1}^{p}\beta_j^2$$

where $\lambda$ is the tuning parameter and $\beta_j$ are the model coefficients.
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(dataset.data, dataset.target, test_size=0.2, random_state=42)
# Create a Ridge regression object with alpha=0.5
ridge_reg = Ridge(alpha=0.5)
# Fit the Ridge regression model to the training data
ridge_reg.fit(X_train, y_train)
# Use the trained model to make predictions on the testing data
y_pred = ridge_reg.predict(X_test)
# Calculate the mean squared error of the predictions
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error: ", mse)
df.round(2)
Mean Squared Error: 0.5635171842142302
 | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | icpt |
---|---|---|---|---|---|---|---|---|---|
0 | 0.13 | 0.02 | 0.03 | -0.01 | -0.0 | -0.01 | -0.03 | -0.01 | 2.07 |
plot_ridge_coefs()
plot_ridge_mse()
Elastic net is a regularization technique combining the L1 and L2 penalties used in Lasso and Ridge regression:

$$\min_{\beta}\ \sum_{i=1}^{n}\left(y_i-\beta_0-\sum_{j=1}^{p}\beta_j x_{ij}\right)^2+\lambda\left(\rho\sum_{j=1}^{p}\left|\beta_j\right|+\frac{1-\rho}{2}\sum_{j=1}^{p}\beta_j^2\right)$$

where: $\lambda$ controls the overall regularization strength and $\rho$ (the l1_ratio) controls the mix between the L1 and L2 penalties.
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(dataset.data, dataset.target, test_size=0.2, random_state=42)
# Create an Elastic Net object with alpha=0.5, l1_ratio=0.5
enet = ElasticNet(alpha=0.5, l1_ratio=0.5)
# Fit the Elastic Net model to the training data
enet.fit(X_train, y_train)
# Use the trained model to make predictions on the testing data
y_pred = enet.predict(X_test)
# Calculate the mean squared error of the predictions
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error: ", mse)
df.round(2)
Mean Squared Error: 0.6868730783041608
 | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | icpt |
---|---|---|---|---|---|---|---|---|---|
0 | 0.34 | 0.01 | -0.0 | 0.0 | 0.0 | -0.0 | -0.0 | -0.0 | 0.31 |
plot_enet_coef_alpha()
plot_enet_mse_alpha()
plot_enet_coef_l1ratio()
plot_enet_mse_l1ratio()
Feature extraction: This involves transforming raw data into a new set of features that better capture the underlying patterns in the data. This can include techniques such as principal component analysis (PCA), singular value decomposition (SVD), Linear Discriminant Analysis (LDA), etc.
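As a minimal sketch of feature extraction with PCA (synthetic data and illustrative names; scikit-learn's PCA is used, with the usual assumption that features are standardized first):
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
# synthetic data: 100 samples, 5 correlated features (hypothetical example)
rng = np.random.default_rng(42)
base = rng.normal(size=(100, 2))
X = np.hstack([base, base @ rng.normal(size=(2, 3)) + rng.normal(0, 0.1, size=(100, 3))])
# standardize, then project onto the first two principal components
X_scaled = StandardScaler().fit_transform(X)
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
print('Explained variance ratio:', pca.explained_variance_ratio_.round(3))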