# TODOs

1. Data Generation:
    - numerical data generation - improvements in distribution ***DONE***
    - business rules adjustments (only after numerical data generation is fixed)
    - check the distribution of each numerical feature of the merged data ***DONE***
    - categorical data - add a check (with Kolmogorov Smirnov or another one) that tells you if the distribution of the synthetic data is ok - ***DONE***
    - Marchev added pers_exp as a categorical column; it also exists in the numerical cols. Our task is to first generate the distribution (and values) for this column (assuming normal distribution) and based on it generate the rest.
<br>
<br>
2. Feature Engineering:
    - Generate new features (**custom made**, statistical) - ***Girls***
    - Balance data 
        - research other methods besides SMOTE; **Multiclass problem handling - should we resample for each target class individually or not? Describe all possible prediction scenarios** - ***ALL***
<br>
<br>
3. Feature Selection 
    - research methods for numerical and categorical feature selections
<br>
<br>
4. Modeling:
    - try different models - with at least 5k examples!!! 
        - black box and explainable ones
    - visualizations - compare performances
    - try to explain the black box models (OPTIONAL)
    - hyperparam optimization (OPTIONAL)

# Inputs and tools

## Imports

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn.metrics import accuracy_score, confusion_matrix,roc_curve, roc_auc_score, precision_score, recall_score, precision_recall_curve
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier



# from fitter import Fitter, get_common_distributions, get_distributions
import random

import seaborn as sns
import matplotlib.pyplot as plt

# Increase the maximum number of rows and columns to be displayed
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 1000)

## Categorical data distributions

In [None]:
dists = {
#     'pers_exp':{'labels':['group1', 'group2', 'group3', 'group4'], 'values':[0.10,0.44,0.33,0.14]},
    'sex':{'labels':['M', 'F'], 'values':[0.4854368932,0.5145631068]},
    'lv_educ':{'labels':['Incomplete', 'Primary', 'Basic', 'Secondary', 'Higher'], 'values':[0.0595,0.07788016474,0.2309254283,0.4359496722,0.1957067711]},
    'empl_stat':{'labels':['Employers', 'Self-employed', 'Employed in private sector', 'Employed in public sector', 'Unpaid family workers'], 'values':[0.03631598652,0.07272557095,0.6708785723,0.2126544365,0.00742543367]},
    'marit_stat':{'labels':['Single', 'Married', 'Divorced', 'Widowed'], 'values':[0.397,0.443,0.058,0.102]},
    'house_memb':{'labels':['1', '2', '3', '4', '5', '6', '7+'], 'values':[0.1805,0.3778,0.2387,0.1157,0.0525,0.0238,0.011]},
    'chil_u_18_y':{'labels':['No children under 18', 'One child under 18', 'Two children under 18', 'Three children under 18', 'Four children under 18', 'Five children under 18', 'Six or more children under 18'], 'values':[0.422602157,0.36552047,0.183222339,0.020674764,0.004993779,0.001875149,0.001111341]},
    'nation':{'labels':['Bulgaria', 'EU', 'Other'], 'values':[0.9950198043,0.001146570676,0.003833625045]},
    'religion':{'labels':['Protestant', 'Catholic', 'Orthodox', 'Muslim', 'Other', 'No religion', 'I do not identify myself'], 'values':[0.011,0.008,0.76,0.1,0.002,0.047,0.072]},
    'soc_econ_stat':{'labels':['Economically active', 'Economically inactive'], 'values':[0.6151643031,0.3848356969]},
    'prof_ind':{'labels':['Agriculture, forestry and fisheries', 'Mining and processing industry', 'Utilities (electricity distribution and water supply)', 'Construction', 'Trade, automobile and motorcycle repair', 'Transportation, warehousing and mail', 'Hospitality and restaurant services', 'Creation and distribution of information and creative products, Telecommunications', 'Financial and administrative activities', 'Public administration', 'Education and research', 'Human health and social work', 'Other activities'], 'values':[0.03090815115,0.2353,0.029,0.05523651408,0.1645618594,0.06439111505,0.05161626582,0.03936261795,0.07356911161,0.04836124844,0.104946474,0.06006423384,0.04269692032]},
    'prof_stat':{'labels':['Management contract', 'Employment contract', 'Civil contract', 'Self-employed', 'Unemployed', 'Pensioner'], 'values':[0.01783393631,0.4732428049,0.02497602302,0.0385148509,0.167699009,0.277733376]},
    'count_house':{'labels':['0', '1', '2+'], 'values':[0.37,0.6,0.03]},
    'own_field':{'labels':['YES', 'NO'], 'values':[0.2621335023,0.737866497676384]},
    'num_car_house':{'labels':['0', '1', '2', '3+'], 'values':[0.5714285714,0.36,0.06428571429,0.004285714286]},
    'own_rent_house':{'labels':['my own', 'rented'], 'values':[0.843,0.157]},
    'edu':{'labels':['Educational Sciences', 'Humanities', 'Social, Economic and Legal Sciences', 'Natural Sciences, Mathematics and Informatics', 'Technical Sciences', 'Agricultural Sciences and Veterinary Medicine', 'Health and Sports', 'Arts', 'Security and Defense'], 'values':[0.07591254907,0.0461889827,0.5266633332,0.04571641724,0.1533297557,0.01776640163,0.0930038303,0.02247374859,0.01891291637]},
    'temp':{'labels':['Choleric', 'Phlegmatic', 'Sanguine', 'Melancholic'], 'values':[0.38,0.11,0.23,0.28]},
    'invest_exp':{'labels':['0', '1-5', '6-10', '11-15', '16-25'], 'values':[0.7,0.2,0.06,0.03,0.01]},
    'shares':{'labels':['YES', 'NO'], 'values':[0.003394353314,0.9966056467]},
    'corp_oblig':{'labels':['YES', 'NO'], 'values':[0.0003792213936,0.9996207786]},
    'oth':{'labels':['YES', 'NO'], 'values':[0.000592597502012084,0.999407402497988]},
    'inv_fund':{'labels':['YES', 'NO'], 'values':[0.06491199709,0.9350880029]},
    'cash':{'labels':['YES', 'NO'], 'values':[0.04105169923,0.9589483008]},
    'crypto':{'labels':['YES', 'NO'], 'values':[0.003284135938,0.9967158641]},
    'gov_bond':{'labels':['YES', 'NO'], 'values':[0.06835666691,0.9316433331]},
    'deposits':{'labels':['YES', 'NO'], 'values':[0.8180293286,0.1819706714]},
    'banking':{'labels':['Online', 'Offline'], 'values':[0.09,0.91]},
    'bk_oprat':{'labels':['Up to 7', 'From 8 to 10', 'From 11 to 13', 'From 14 to 18', 'From 19 to more'], 'values':[0.0084,0.2424,0.4729,0.2615,0.0148]},
    'bk_dc':{'labels':['Under one', 'One', 'Two', 'Three'], 'values':[0.01,0.57,0.38,0.04]},
    'bk_cc':{'labels':['YES', 'NO'], 'values':[0.17,0.83]},
    'bk_acc':{'labels':['YES', 'NO'], 'values':[0.8634087377,0.1365912623]},
    'ins_prop':{'labels':['YES', 'NO'], 'values':[0.05,0.95]},
    'ins_life':{'labels':['YES', 'NO'], 'values':[0.09,0.91]},
    'ins_casco':{'labels':['YES', 'NO'], 'values':[0.03,0.97]},
    'health_ins':{'labels':['YES', 'NO'], 'values':[0.02,0.98]},
    'overdraft':{'labels':['YES', 'NO'], 'values':[0.19,0.81]},
    'cons_cred':{'labels':['YES', 'NO'], 'values':[0.26,0.74]},
    'mortgage':{'labels':['YES', 'NO'], 'values':[0.02,0.98]},
    'car_leas':{'labels':['YES', 'NO'], 'values':[0.2,0.8]},
    'pens_ins':{'labels':['YES', 'NO'], 'values':[0.11,0.89]},
    'overdraft_app':{'labels':['YES', 'NO'], 'values':[0.2439,0.7561]},
    'cons_cred_app':{'labels':['YES', 'NO'], 'values':[0.305299502487562,0.694700497512438]},
    'mortgage_app':{'labels':['YES', 'NO'], 'values':[0.03,0.97]},
    'bk_cc_app':{'labels':['YES', 'NO'], 'values':[0.21,0.79]}
}

In [None]:
len(dists)

## Correlation of Numerical data 

In [None]:
corr = {
    'features': ['age', 'ind_risk', 'income', 'pers_exp', 'house_exp', 'taxes', 'transp_telecom', 'hobby'],
    'age': [1, -0.00665947056405372, 0.00291644965339247, 0.0107779942638097, 0.00698674581731255, 0.00729153655132963, 0.0099866509330216, 0.00931630696561133],
    'ind_risk': [-0.00665947056405372, 1, 0.0039918072709289, 0.00806259039194059, 0.00457023635440603, 0.0061985340641631, 0.00768699810849585, -0.00332322616613201],
    'income': [0.00291644965339247, 0.0039918072709289, 1, 0.560949334881676, 0.58892666343229, 0.581907424628933, 0.562946509689962, 0.352350802339294],
    'pers_exp': [0.0107779942638097, 0.00806259039194059, 0.560949334881676, 1, 0.928449923861951, 0.929598634668897, 0.934775947642248, 0.714298364869941],
    'house_exp': [0.00698674581731255, 0.00457023635440603, 0.58892666343229, 0.928449923861951, 1, 0.93031279279417, 0.927846735467478, 0.679286362990223],
    'taxes': [0.00729153655132963, 0.0061985340641631, 0.581907424628933, 0.929598634668897, 0.93031279279417, 1, 0.92920510128812, 0.689442053350162],
    'transp_telecom': [0.0099866509330216, 0.00768699810849585, 0.562946509689962, 0.934775947642248, 0.927846735467478, 0.92920510128812, 1, 0.714114127908189],
    'hobby': [0.00931630696561133, -0.00332322616613201, 0.352350802339294, 0.714298364869941, 0.679286362990223, 0.689442053350162, 0.714114127908189, 1]
}



means = [50.58, 0.6039, 13579.30, 5305.00, 2260, 1500, 1335, 1740]

## Extract distributions from dictionary

In [None]:
def extract_dists(x, dists):
    '''
    Return the distribution of one feature as a single-row DataFrame, where:
    x is the name of the feature (a key of dists)
    dists is the dictionary with all distributions

    The feature's labels become the columns, its values fill the only row,
    and that row is indexed by the feature name.
    '''
    import pandas as pd

    spec = dists[x]
    return pd.DataFrame(
        data=[spec['values']],
        columns=spec['labels'],
        index=pd.Index([x]),
    )

## Convert the dictionary with correlation matrix to dataframe

In [None]:
def corr2df(corr):
    '''
    A function to create correlation dataframe from dictionary corr, where
    corr is the dictionary with the correlation matrix

    The 'features' entry of corr supplies the row labels; every other entry
    is one column of the matrix. The input dictionary is not modified.
    '''
    # The alias must be `pd`: the original imported pandas as `df`, so the
    # body only worked when a global `pd` happened to exist.
    import pandas as pd

    corr_df = pd.DataFrame(corr).set_index('features')
    # Drop the leftover "features" index name so the frame prints as a plain matrix
    corr_df.index.name = None
    return corr_df

# Data Generation (Data synthesis)

## Categorical Data

In [None]:
import numpy as np

# Size of the synthetic dataset (rows generated for every categorical column)
num_rows = 250000

# Sample each categorical column independently from its target distribution
dataset = {}
for key, value_dict in dists.items():
    probabilities = value_dict['values']

    # np.random.choice needs weights that sum to 1, so rescale them
    total = np.sum(probabilities)
    normalized_probabilities = probabilities / total

    dataset[key] = np.random.choice(
        value_dict['labels'],
        size=num_rows,
        p=normalized_probabilities,
    )

# Printing the first 10 rows of the synthetic dataset
# for key in dataset:
#     print(f"{key}: {dataset[key][:10]}")


In [None]:
import pandas as pd

cat_df = pd.DataFrame(dataset)

### Perform a Kolmogorov-Smirnov distribution test

In [None]:
def kolmogorov_smirnov_test(categorical_data, old_distributions):
    """
    Compare the category frequencies of generated data with the target ones.

    For every column of categorical_data, the target probabilities and the
    observed relative frequencies of the same labels (in label order) are
    passed to scipy.stats.ks_2samp as two samples.

    NOTE(review): this runs the KS test on two short vectors of
    probabilities, not on the observations themselves; a chi-square
    goodness-of-fit test is the usual choice for categorical data - confirm
    that this is the intended check.

    Parameters:
        categorical_data (pd.DataFrame): Generated categorical columns. Every
            column name must be a key of old_distributions.
        old_distributions (dict): Maps a column name to a dict with 'labels'
            and 'values' (the target probability of each label).

    Returns:
        dict: column name -> True when the p-value is below alpha = 0.05
        (distributions reported as different), False otherwise.

    Raises:
        KeyError: If a column of categorical_data is not in old_distributions.
    """
    from scipy.stats import ks_2samp

    # Significance level the p-value is tested against
    alpha = 0.05

    statistics = {}

    for col in categorical_data:

        print()
        print("Reviewed column: ", col)

        old_labels = old_distributions[col]['labels']
        old_values = old_distributions[col]['values']

        # Count once per column instead of once per label
        observed = categorical_data[col].value_counts(normalize=True)

        old_dis_data = []
        new_dis_data = []
        for label, target in zip(old_labels, old_values):
            old_dis_data.append(target)
            # A label that was never sampled has an observed frequency of 0.
            # (Labels are matched exactly, so a dtype mismatch such as
            # 1 vs '1' also ends up here.)
            new_dis_data.append(observed.get(label, 0.0))

        old_dis_data_array = np.array(old_dis_data)
        new_dis_data_array = np.array(new_dis_data)

        print(f"old dist: {old_dis_data_array}")
        print(f"new dist: {new_dis_data_array}")

        # Perform the KS test
        ks_statistic, p_value = ks_2samp(old_dis_data_array, new_dis_data_array)

        # Report the actual p-value (the original printed 1 - p_value under
        # the "p_value" label)
        if p_value < alpha:
            print(f"Distributions are different for column: {col}, p_value={p_value}")
            statistics[col] = True
        else:
            print(f"Distributions are similar for column: {col}, p_value={p_value}")
            statistics[col] = False

    print("......................")
    print()
    return statistics

In [None]:
categorical_ks_statistics = kolmogorov_smirnov_test(categorical_data=cat_df,
                                                   old_distributions=dists)

### Save

In [None]:
# cat_df.to_csv("df_cat_No_Br_250k_v2.csv")

Notes: 
- generate all categorical data (including pers_exp)
- use its values as the basis when generating the numerical data: all other features in the correlation matrix should be generated from their relationship with pers_exp and the MEANS provided in the inputs

## Numerical Data

In [None]:
corr.keys()

In [None]:
# Rows of the correlation matrix in dictionary order; the 'features' entry
# only carries the row labels, so it is left out
l_corr_matrix = [v for k, v in corr.items() if k != "features"]

arr_corr_matrix = np.array(l_corr_matrix)

In [None]:
print(arr_corr_matrix)

### Generate synthetic data (Copula)

In [None]:
# Allowed [min, max] range of every numerical feature, listed in the same
# order as the features of the correlation matrix
possible_values = {
    'age': [20, 86],
    'ind_risk': [0, 1],
    'income': [0, 150000],
    'pers_exp': [0, 6000],
    'house_exp': [0, 4000],
    'taxes': [0, 2500],
    'transp_telecom': [0, 2500],
    'hobby': [0, 3000],
}

# The same ranges as (lower, upper) tuples
feature_bounds = [tuple(v) for v in possible_values.values()]

# An earlier attempt derived the mean from the range midpoints and the std as
# mean / 3; a constant scale of 3 for each of the 8 features is used instead.
std = [3] * 8
    
import numpy as np
from scipy.stats import norm
from sklearn.preprocessing import MinMaxScaler

def generate_synthetic_data_with_bounds(correlation_matrix, num_samples, feature_bounds, means=means, std=std):
    """
    Draw correlated samples and map them into per-feature bounds.

    Samples come from a zero-mean multivariate normal whose covariance is
    correlation_matrix. They are passed through norm.cdf (loc 0, scale std)
    to land in (0, 1), then stretched linearly onto each feature's
    [lower, upper] interval.

    Parameters:
        correlation_matrix (np.ndarray): Square, symmetric, positive definite
            matrix of shape (num_features, num_features).
        num_samples (int): Number of rows to generate.
        feature_bounds (sequence of (lower, upper)): One pair per feature, in
            the column order of correlation_matrix.
        means: Accepted for interface compatibility; not used by the body.
        std: Scale handed to norm.cdf - a scalar or one value per feature.
            The draws have unit variance, so a scale above 1 concentrates the
            cdf values around 0.5, i.e. around the middle of each interval.

    The defaults for means and std are the module-level objects of the same
    name, bound when the function is defined.

    Returns:
        np.ndarray: Array of shape (num_samples, num_features).

    Raises:
        ValueError: If correlation_matrix is not symmetric or not positive
            definite.
    """
    num_features = correlation_matrix.shape[0]
    lower_bounds, upper_bounds = (np.asarray(side, dtype=float) for side in zip(*feature_bounds))

    # Check if the correlation matrix is valid (symmetric and positive definite)
    if not np.allclose(correlation_matrix, correlation_matrix.T):
        raise ValueError("Correlation matrix must be symmetric.")
    # eigvalsh is the routine for symmetric matrices and always returns real
    # eigenvalues, so the "> 0" test is well defined
    if not np.all(np.linalg.eigvalsh(correlation_matrix) > 0):
        raise ValueError("Correlation matrix must be positive definite.")

    # Generate synthetic data using multivariate normal distribution
    mean = np.zeros(num_features)
    synthetic_data = np.random.multivariate_normal(mean, correlation_matrix, num_samples)

    # Map every draw into (0, 1) through the normal cdf
    synthetic_data = norm.cdf(synthetic_data, loc=mean, scale=std)

    # Stretch each column onto its [lower, upper] interval (broadcast over rows)
    return lower_bounds + synthetic_data * (upper_bounds - lower_bounds)

# Build the target correlation array and generate the numerical dataset
num_samples = 250000
correlation_matrix = np.array(l_corr_matrix)

synthetic_data = generate_synthetic_data_with_bounds(
    correlation_matrix=correlation_matrix,
    num_samples=num_samples,
    feature_bounds=feature_bounds,
)
print(synthetic_data)


In [None]:
import pandas as pd

adjusted_df = pd.DataFrame(synthetic_data, columns=possible_values.keys())
adjusted_df

In [None]:
pd.DataFrame(correlation_matrix)

### Plot distributions

In [None]:
import plotly.express as px

def plot_distribution(data, column_name):
    """
    Show a 50-bin Plotly histogram of a single DataFrame column.

    Parameters:
        data (pd.DataFrame): Frame that holds the column.
        column_name (str): Column whose values are plotted.

    Returns:
        None

    Raises:
        ValueError: If column_name is not a column of data.
    """
    column_missing = column_name not in data.columns
    if column_missing:
        raise ValueError(f"Column '{column_name}' not found in the DataFrame.")

    histogram = px.histogram(
        data,
        x=column_name,
        nbins=50,
        title=f'Distribution of {column_name}',
    )
    histogram.show()


In [None]:
for col in adjusted_df.columns:
    plot_distribution(adjusted_df, col)

### Plot the two correlation matrices

In [None]:
possible_values.keys()

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Correlation matrix of the generated numerical data
correlation_matrix_adjusted = adjusted_df.corr()

print("Adjusted Correlation Matrix:")
print(correlation_matrix_adjusted)

# Target correlation matrix, rebuilt from the input dictionary `corr`
original_corr_df = pd.DataFrame(corr)
original_corr_df.set_index('features', inplace=True)

print("\nOriginal Correlation Matrix:")
print(original_corr_df)

# Plot the correlation matrix heatmaps for both adjusted and original data side by side
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
fig.suptitle("Comparison of Correlation Matrices", fontsize=16)

panels = zip(
    axes,
    (correlation_matrix_adjusted, original_corr_df),
    ("Adjusted Correlation Matrix", "Original Correlation Matrix"),
)
for ax, matrix, title in panels:
    # Pin the colour scale to the full correlation range. Without vmin/vmax
    # each panel is scaled to its own min/max, so the same colour would stand
    # for different values in the two heatmaps.
    image = ax.imshow(matrix, cmap='coolwarm', interpolation='nearest', vmin=-1, vmax=1)
    ticks = np.arange(len(matrix))
    ax.set_xticks(ticks)
    ax.set_yticks(ticks)
    ax.set_xticklabels(matrix.columns, rotation=45)
    ax.set_yticklabels(matrix.columns)
    ax.set_title(title)

# One colorbar serves both panels because they share the same scale
fig.colorbar(image, ax=axes, shrink=0.8)

plt.show()


In [None]:
def correlation_distance(matrix1, matrix2):
    """Return the Frobenius norm of the element-wise difference matrix1 - matrix2."""
    difference = matrix1 - matrix2
    return np.linalg.norm(difference, ord='fro')


current_distance = correlation_distance(matrix1=correlation_matrix_adjusted, matrix2=original_corr_df)

In [None]:
current_distance

## Save

In [None]:
# adjusted_df.to_csv("adjusted_df_num_No_BR_250k_v3.csv")

## Combine the two datasets

In [None]:
# Reload the numerical and categorical datasets from disk; the first CSV
# column is used as the index in both.
# NOTE(review): the commented-out save cells in this notebook write
# "adjusted_df_num_No_BR_250k_v3.csv" and "df_cat_No_Br_250k_v2.csv", while
# the files read here carry no version suffix - confirm these are the
# intended versions.
num_df = pd.read_csv("adjusted_df_num_No_BR_250k.csv", index_col=[0])
cat_df = pd.read_csv("df_cat_No_Br_250k.csv", index_col=[0])

In [None]:
df_merged = pd.concat([num_df, cat_df], axis=1)
df_merged.shape

In [None]:
df_merged.head()

## Apply the business rules

In [None]:
df_merged.columns

In [None]:
# 100 rows: 80 with status Married and house_memb > 2, 20 with status Married and house_memb <= 2;
# the task is to change the status of those 20 rows.

# i.e.

100 rows, 20 non_comply, replace 20 with complying values, concatenate back to the 100, out of those 100 select other 20 that don't correspond to the independent feature value of the B_RULE and replace the corresponding values with  

Plan to apply business rules:

- list all columns that are going to be affected
- separate the rest (this will preserve their distributions and number of examples)
- sort the rules in a way that the dependent features are repeated (if applicable). In this way we are only generating new data only after all operations on this column are done. 
- business rules should be applied as follows:

    - Select the first and independent and dependent features
        - apply the business rule
            - save the rows that comply with the business rule - comply_df
            - save the rows that DO NOT comply - non_comply_df
                - out of them concatenate the independent feature to the comply_df (this will leave n number of NaN's on the dependent feature side)
                - count the number of NaN's left (same as the number of rows in the non_comply_df)
                - use it as a value for n_samples
                - generate that many n_samples for the same dependent feature (problem, how to assure that it follows the same old distribution??)


- Option 2:
    - instead of replacing the values with NaN's, replace them with other values that comply with the business rules
        - HOW: save the num of rows that DO NOT comply with the business rules
        - generate a unique list of all other values that comply
        - replace the NaN's (or the current value in the non_comply_df) with random distribution from the other values
        - go to the comply_df and also replace the same number of value

In [None]:
import pandas as pd


# Business rules as a list of dictionaries.
#
# Each "... value" / "... value filter" string is spliced directly after
# dataframe['<feature>'] to build a boolean mask, so it must be a valid pandas
# expression tail (a comparison such as "=='Married'" or a method call such as
# ".isin([...])") and must use the exact labels that occur in the data.
#
# Fixes relative to the first version of this list:
#   - YES/NO features are stored as 'YES' / 'NO' (see `dists`), so filters
#     written as 'Y' / 'N' never matched a single row.
#   - chil_u_18_y holds text labels ('No children under 18', ...). Comparing
#     those strings with '2' or '1' is a lexicographic comparison: "<'2'"
#     matched no row and ">'1'" matched every row. The intended groups are
#     now listed explicitly with .isin().
#
# NOTE(review): with the generation bounds used in this notebook
# (age in [20, 86], taxes in [0, 2500]) the rules "age <18" and
# "taxes >2500" cannot match any row - confirm the intended thresholds.
full_business_rules = [
    {"Independent feature": "marit_stat", "Independent feature value": "=='Married'", "Dependent feature": "house_memb", "Dependent feature value filter": ">'2'", "Note": "The number of household members in family households is more likely to be greater than 2"},
    {"Independent feature": "prof_ind", "Independent feature value": "=='Financial and administrative activities'", "Dependent feature": "invest_exp", "Dependent feature value filter": ">'0'", "Note": "They are more likely to own a bank account"},
    {"Independent feature": "age", "Independent feature value": "<25", "Dependent feature": "invest_exp", "Dependent feature value filter": "=='0'", "Note": "Under 24s are less likely to have investment experience. Between 35-44 and 45-54 are more likely to have extensive investment experience"},
    {"Independent feature": "age", "Independent feature value": "<25", "Dependent feature": "lv_educ", "Dependent feature value filter": "!='Higher'", "Note": "Under 24s are less likely to have a college degree"},
    {"Independent feature": "age", "Independent feature value": "<25", "Dependent feature": "chil_u_18_y", "Dependent feature value filter": ".isin(['No children under 18', 'One child under 18'])", "Note": "From 20-24, it is less likely to have more than 1 child under 18"},
    {"Independent feature": "invest_exp", "Independent feature value": ">'0'", "Dependent feature": "deposits", "Dependent feature value filter": "=='YES'", "Note": "They are more likely to own a bank account"},
    {"Independent feature": "shares", "Independent feature value": "=='YES'", "Dependent feature": "invest_exp", "Dependent feature value filter": ">'0'", "Note": "Previous investment experience in years"},
    {"Independent feature": "corp_oblig", "Independent feature value": "=='YES'", "Dependent feature": "invest_exp", "Dependent feature value filter": ">'0'", "Note": "Previous investment experience in years"},
    {"Independent feature": "oth", "Independent feature value": "=='YES'", "Dependent feature": "invest_exp", "Dependent feature value filter": ">'0'", "Note": "Previous investment experience in years"},
    {"Independent feature": "inv_fund", "Independent feature value": "=='YES'", "Dependent feature": "invest_exp", "Dependent feature value filter": ">'0'", "Note": "Previous investment experience in years"},
    {"Independent feature": "cash", "Independent feature value": "=='YES'", "Dependent feature": "invest_exp", "Dependent feature value filter": ">'0'", "Note": "Previous investment experience in years"},
    {"Independent feature": "crypto", "Independent feature value": "=='YES'", "Dependent feature": "invest_exp", "Dependent feature value filter": ">'0'", "Note": "Previous investment experience in years"},
    {"Independent feature": "gov_bond", "Independent feature value": "=='YES'", "Dependent feature": "invest_exp", "Dependent feature value filter": ">'0'", "Note": "Previous investment experience in years"},
    {"Independent feature": "age", "Independent feature value": "<25", "Dependent feature": "bk_acc", "Dependent feature value filter": "=='NO'", "Note": "Under 24s are less likely to have a checking account"},
    {"Independent feature": "age", "Independent feature value": "<18", "Dependent feature": "bk_acc", "Dependent feature value filter": "=='NO'", "Note": "Under 18 is not possible to have a current account"},
    {"Independent feature": "lv_educ", "Independent feature value": "=='Higher'", "Dependent feature": "income", "Dependent feature value filter": ">27601", "Note": "A higher level of education implies earnings in the upper range"},
    {"Independent feature": "chil_u_18_y", "Independent feature value": ".isin(['Two children under 18', 'Three children under 18', 'Four children under 18', 'Five children under 18', 'Six or more children under 18'])", "Dependent feature": "house_memb", "Dependent feature value filter": ">'3'", "Note": "The number of household members is directly dependent on the number of children under 18"},
    {"Independent feature": "lv_educ", "Independent feature value": "=='Higher'", "Dependent feature": "soc_econ_stat", "Dependent feature value filter": "=='Economically active'", "Note": "A higher level of education implies an economically active status"},
    {"Independent feature": "income", "Independent feature value": ">27601", "Dependent feature": "taxes", "Dependent feature value filter": ">2500", "Note": "Earnings in the upper range correspond to higher taxes and insurance"},
]


# Function to apply a single business rule
def apply_business_rule(rule, dataframe):
    """
    Return the rows of dataframe that satisfy both sides of a business rule.

    Parameters:
        rule (dict): Must contain "Independent feature", "Independent feature
            value", "Dependent feature" and "Dependent feature value filter".
            The two value strings are expression tails (e.g. "=='Married'",
            "<25") that are appended to the corresponding column.
        dataframe (pd.DataFrame): Data to filter; it is not modified.

    Returns:
        pd.DataFrame: Rows where the independent AND the dependent condition
        both hold.

    Raises:
        KeyError: If a rule key or a referenced column is missing.
    """
    independent_feature = rule["Independent feature"]
    independent_feature_value = rule["Independent feature value"]
    dependent_feature = rule["Dependent feature"]
    dependent_feature_value_filter = rule["Dependent feature value filter"]

    # Build the boolean-mask expression from the rule's text fragments
    mask_expression = (
        f"(dataframe['{independent_feature}'] {independent_feature_value})"
        f" & (dataframe['{dependent_feature}'] {dependent_feature_value_filter})"
    )

    print("filter_condition: ", f"[{mask_expression}]")

    # SECURITY: eval executes the rule text as Python. Rules must come from
    # trusted, in-repo definitions only - never from external input.
    # Only `dataframe` is exposed to the expression.
    mask = eval(mask_expression, {}, {"dataframe": dataframe})

    # Filter the frame that was passed in (the original indexed the global
    # df_merged here and ignored the `dataframe` argument)
    return dataframe[mask]

# Filter the merged dataset once per business rule
filtered_dfs = [apply_business_rule(rule, df_merged) for rule in full_business_rules]

# TODO, per rule: compare the distribution of the independent variable before
# and after filtering; if they differ, adjust the new one to follow the old
# one. Then do the same for the dependent variable.

# Stack the per-rule results and keep every distinct row only once
all_filtered_dfs = pd.concat(filtered_dfs)
df_BR_applied = all_filtered_dfs.drop_duplicates()

# print(final_df)


In [None]:
df_BR_applied

## Save

In [None]:
# df_BR_applied.to_csv("df_BR_applied_v2.csv")

## Check distributions (cat features) and corr matrix (num variables) after Business rules

In [None]:
df_BR_applied = pd.read_csv("df_BR_applied_v2.csv", index_col="Unnamed: 0")

In [None]:
num_df.columns

In [None]:
num_cols = ['age', 'ind_risk', 'income', 'pers_exp', 'house_exp', 'taxes',
       'transp_telecom', 'hobby']

cat_cols = [col for col in df_merged.columns if col not in num_cols]

df_BR_num = df_BR_applied.loc[:, num_cols]
df_BR_cat = df_BR_applied.loc[:, cat_cols]

### Categorical

In [None]:
df_BR_cat["house_memb"].value_counts(normalize=True).sort_index()
# 0.1805,0.3778,0.2387,0.1157,0.0525,0.0238,0.011

In [None]:
categorical_ks_statistics = kolmogorov_smirnov_test(categorical_data=df_BR_cat,
                                                   old_distributions=dists)

### Numerical

In [None]:

for col in df_BR_num.columns:
    plot_distribution(df_BR_num, col)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Correlation matrix of the numerical data after the business rules
correlation_matrix_adjusted = df_BR_num.corr()

print("Adjusted Correlation Matrix:")
print(correlation_matrix_adjusted)

# Target correlation matrix, rebuilt from the input dictionary `corr`
original_corr_df = pd.DataFrame(corr)
original_corr_df.set_index('features', inplace=True)

print("\nOriginal Correlation Matrix:")
print(original_corr_df)

# Plot the correlation matrix heatmaps for both adjusted and original data side by side
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
fig.suptitle("Comparison of Correlation Matrices", fontsize=16)

panels = zip(
    axes,
    (correlation_matrix_adjusted, original_corr_df),
    ("Adjusted Correlation Matrix", "Original Correlation Matrix"),
)
for ax, matrix, title in panels:
    # Pin the colour scale to the full correlation range. Without vmin/vmax
    # each panel is scaled to its own min/max, so the same colour would stand
    # for different values in the two heatmaps.
    image = ax.imshow(matrix, cmap='coolwarm', interpolation='nearest', vmin=-1, vmax=1)
    ticks = np.arange(len(matrix))
    ax.set_xticks(ticks)
    ax.set_yticks(ticks)
    ax.set_xticklabels(matrix.columns, rotation=45)
    ax.set_yticklabels(matrix.columns)
    ax.set_title(title)

# One colorbar serves both panels because they share the same scale
fig.colorbar(image, ax=axes, shrink=0.8)

plt.show()


# Feature Engineering

In [None]:
# Load the dataset produced after applying the business rules
df = pd.read_csv("df_BR_applied_v2.csv", index_col="Unnamed: 0")
# data_sample = df.sample(n=100)
# No copy is made: data_sample and df are the same object, so any in-place
# change to data_sample (e.g. drop(..., inplace=True)) also changes df.
data_sample = df

In [None]:
df["mortgage"].value_counts()

## Data Binning

In [None]:
cat_cols = list(dists.keys())
print("few categorical columns: ", cat_cols[:5])
num_cols = [col for col in df.columns if col not in cat_cols]
print("few numerical columns: ", num_cols[:5])

### Bin the numerical data

In [None]:
data_sample_num = data_sample[num_cols]

In [None]:
data_sample_num

In [None]:
import pandas as pd

# Function to bin a feature based on the provided bins
def bin_feature(df, feature, bins):
    """
    Add an integer bin index for one numerical column.

    A new column named '<feature>_binned' is written into df IN PLACE, and
    the same df object is returned.

    Parameters:
        df (pd.DataFrame): Frame holding the column; it is modified.
        feature (str): Name of the numerical column to bin.
        bins (list): Increasing bin edges passed to pandas.cut.

    Returns:
        pd.DataFrame: df itself, with the extra '<feature>_binned' column
        (0-based bin index; NaN for values outside the edges).
    """
    # Intervals are right-closed, so without include_lowest a value equal to
    # the first edge (e.g. age 20 or ind_risk 0) would fall outside every bin
    # and become NaN.
    df[feature + '_binned'] = pd.cut(df[feature], bins=bins, labels=False, include_lowest=True)
    return df

# Binning specifications from the provided table.
# Each list holds the bin edges of one numerical feature; the final
# float('inf') edge leaves the last bin open-ended.
bins_info = {
    'age': [20, 25, 35, 50, 65, 85, float('inf')],
    'ind_risk': [0, 0.2, 0.4, 0.6, 0.8, 1, float('inf')],
    'income': [0, 6121, 12001, 27601, 43201, 58801, 74401, float('inf')],
    'pers_exp': [0, 4500, 5000, 5500, float('inf')],
    'house_exp': [0, 500, 1500, 3000, float('inf')],
    'taxes': [0, 500, 1000, 2000, 2500, float('inf')],
    'transp_telecom': [0, 500, 1000, 1500, 2500, float('inf')],
    'hobby': [0, 1500, 2000, 3000, float('inf')],
}

# Perform binning for each feature.
# bin_feature writes the '<feature>_binned' column into data_sample_num itself
# and returns that same frame, so num_df_binned is just another name for
# data_sample_num after every iteration.
for feature, bins in bins_info.items():
    num_df_binned = bin_feature(data_sample_num, feature, bins)



In [None]:
data_sample_num.columns

In [None]:

bin_cols = ['age_binned', 'ind_risk_binned',
       'income_binned', 'pers_exp_binned', 'house_exp_binned', 'taxes_binned',
       'transp_telecom_binned', 'hobby_binned']

data_sample_num_bin_only = data_sample_num[bin_cols]
data_sample_num_bin_only

In [None]:
data_sample_num_bin_only["age_binned"].value_counts()

In [None]:
data_sample_num_bin_only["hobby_binned"].value_counts()

In [None]:
data_sample.drop(num_cols, axis=1, inplace=True)

In [None]:
data_sample = pd.concat([data_sample, data_sample_num_bin_only], axis=1)

In [None]:
cat_cols = list(dists.keys())
print("few categorical columns: ", cat_cols[:5])
num_cols = [col for col in data_sample.columns if col not in cat_cols]
print("few numerical columns: ", num_cols[:5])

## Data encoding (categorical)

In [None]:
def data_encoding(data, categorical_cols):
    """One-hot encode the given categorical columns.

    Parameters:
        data (pandas DataFrame): Input frame; it is not modified.
        categorical_cols (list of str): Columns to expand into indicator columns.

    Returns:
        pandas DataFrame: `data` with each listed column replaced by its
        '<column>_<value>' indicator columns.
    """
    encoded = pd.get_dummies(data, columns=categorical_cols)
    return encoded


In [None]:
encoded_data_sample = data_encoding(data_sample, categorical_cols=cat_cols)

## Feature scaling (data standardization - numerical)

min-max scaling and standardization (z-score normalization).

### NOOO- Split the data into numerical and categorical

In [None]:
def data_split_cat_num(data, numerical_cols):
    """Split a frame into its non-numerical and numerical column groups.

    Parameters:
        data (pandas DataFrame): Frame to split; it is not modified.
        numerical_cols (list of str): Names of the numerical columns.

    Returns:
        tuple: (cat_data, num_data) - every column not listed in
        `numerical_cols`, and the listed columns, each with a fresh 0..n-1 index.
    """
    numeric_part = data[numerical_cols]
    remaining_names = [name for name in data.columns if name not in numerical_cols]
    other_part = data[remaining_names]

    # Both halves get a clean positional index so they can be re-joined side by side
    other_part = other_part.reset_index(drop=True)
    numeric_part = numeric_part.reset_index(drop=True)
    return other_part, numeric_part


In [None]:
encoded_cat_data, encoded_num_data = data_split_cat_num(data=encoded_data_sample, 
                                                       numerical_cols=num_cols)

In [None]:
def data_standardization(data):
    """Standardize every column of `data` with scikit-learn's StandardScaler.

    Parameters:
        data (pandas DataFrame): Numerical columns to standardize.

    Returns:
        pandas DataFrame: The scaled values under the original column names.
        The frame is rebuilt from the scaler's array output, so it carries a
        default integer index rather than the index of `data`.
    """
    from sklearn.preprocessing import StandardScaler

    # The scaler is fitted on, and applied to, the same data in one step
    scaled_values = StandardScaler().fit_transform(data)
    return pd.DataFrame(scaled_values, columns=data.columns)

test standardization with all features (not just the numerical ones)

In [None]:
encoded_data_sample_standardized_numerical = data_standardization(encoded_num_data)

In [None]:
encoded_cat_data.head()

In [None]:
encoded_data_sample_standardized_numerical.head()

In [None]:
encoded_data_sample_standardized = pd.concat([encoded_cat_data, encoded_data_sample_standardized_numerical],
                                            axis=1)

In [None]:
encoded_data_sample_standardized.shape

## Delete binary cols (columns that have only two possible options as values)

Will keep only the "YES" features (e.g. mortgage_YES - a value of 1 here means the client wants to have a mortgage, 0 - otherwise) to reduce the size of the dataset

In [None]:
def del_NO_cols(data, additional_cols_to_del=('banking_Offline', 'own_rent_house_my own', 'soc_econ_stat_Economically inactive', 'sex_F')):
    """Drop the redundant half of every binary indicator pair.

    All columns whose name ends in '_NO' are removed, together with the
    explicitly listed complementary columns.

    Parameters:
        data (pandas DataFrame): Encoded data; it is not modified.
        additional_cols_to_del (iterable of str): Extra columns to drop in
            addition to the '_NO' ones. The default is an immutable tuple so
            the shared default object can never be altered between calls.

    Returns:
        pandas DataFrame: A new frame without the dropped columns.

    Raises:
        KeyError: If one of `additional_cols_to_del` is not a column of `data`.
    """
    # select all "_NO" columns
    cols_to_del = list(data.filter(regex='_NO$').columns)

    # and extend the list with additional columns to be deleted
    cols_to_del.extend(additional_cols_to_del)

    return data.drop(cols_to_del, axis=1)

In [None]:
encoded_data_sample_reduced = del_NO_cols(encoded_data_sample_standardized)
# encoded_data_sample_reduced = del_NO_cols(encoded_data_sample)

In [None]:
encoded_data_sample_reduced.shape

## NOO Split data - predictors and target

Target features:

    Overdraft
    Consumer credit
    Mortgage loan
    Credit card


In [None]:
encoded_data_sample_reduced.columns

In [None]:
target_columns = ["overdraft_YES", "cons_cred_YES", "mortgage_YES", "bk_cc_YES"]

def split_pred_target(data, target_cols):
    """Separate predictor columns from target columns.

    Parameters:
        data (pandas DataFrame): Frame holding predictors and targets.
        target_cols (list of str): Names of the target columns.

    Returns:
        tuple: (X_data, y_data) - all columns not in `target_cols`, and the
        `target_cols` columns themselves.
    """
    feature_names = []
    for name in data.columns:
        if name not in target_cols:
            feature_names.append(name)

    return data[feature_names], data[target_cols]



In [None]:
X, y = split_pred_target(data=encoded_data_sample_reduced, target_cols=target_columns)

## Split data into train and test sets

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

##### Save

In [None]:
X_train.to_csv("X_train_v1.csv")
X_test.to_csv("X_test_v1.csv")
y_train.to_csv("y_train_v1.csv")
y_test.to_csv("y_test_v1.csv")

### Test distributions of train vs test data (Categorical) - Kolmogorov-Smirnov

In [None]:
num_cols

#### Generate encoded categorical distributions for the train data

In [None]:
# Encoded categorical columns = every predictor that is not a numerical column
cat_cols_encoded = [name for name in X.columns if name not in num_cols]

# For each encoded column store its observed values ('labels') and their
# relative frequencies in the training split ('values'), most frequent first
X_train_dists = {}
for name in cat_cols_encoded:
    shares = X_train[name].value_counts(normalize=True)
    X_train_dists[name] = {'labels': list(shares.index),
                           'values': list(shares.values)}

In [None]:
# X_train_dists

#### Perform the test

In [None]:
kolmogorov_smirnov_test(X_test[cat_cols_encoded], X_train_dists)

### Numerical data checks

#### Check distributions of train vs test numerical data

In [None]:
for col in X_train[num_cols].columns:
    plot_distribution(X_train[num_cols], col)

In [None]:
for col in X_test[num_cols].columns:
    plot_distribution(X_test[num_cols], col)

####  Compare the corr matrices

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# ... (Code from the previous answer)

# Correlation matrices of the numerical features in the two splits.
# The variable names are kept from the earlier "adjusted vs original" comparison:
# here `correlation_matrix_adjusted` holds the TEST split and `original_corr_df`
# the TRAIN split, so the printed and plotted labels say test/train explicitly.
correlation_matrix_adjusted = X_test[num_cols].corr()
original_corr_df = X_train[num_cols].corr()

print("Test Correlation Matrix:")
print(correlation_matrix_adjusted)

print("\nTrain Correlation Matrix:")
print(original_corr_df)

# Plot both heatmaps side by side on the same fixed colour scale [-1, 1];
# without a shared scale equal colours would not mean equal correlations.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
fig.suptitle("Comparison of Correlation Matrices (test vs train)", fontsize=16)

panels = [
    (correlation_matrix_adjusted, "Test Correlation Matrix"),
    (original_corr_df, "Train Correlation Matrix"),
]
for ax, (corr_df, panel_title) in zip(axes, panels):
    image = ax.imshow(corr_df, cmap='coolwarm', interpolation='nearest', vmin=-1, vmax=1)
    tick_positions = np.arange(len(corr_df))
    ax.set_xticks(tick_positions)
    ax.set_yticks(tick_positions)
    ax.set_xticklabels(corr_df.columns, rotation=45)
    ax.set_yticklabels(corr_df.columns)
    ax.set_title(panel_title)

# One colour bar serves both panels because they share the scale
fig.colorbar(image, ax=axes, shrink=0.8)

plt.show()


## Generate new features

### Generate Non-Linear features

In [None]:
X_train.columns

#### Logarithmic

In [None]:
orig_cols = X_train.columns

In [None]:
# Append the natural logarithm of every original column to both splits.
# Non-positive inputs yield NaN / -inf; such columns are removed further down.
for col in orig_cols:
    log_name = col + "_log"
    for split in (X_train, X_test):
        split[log_name] = np.log(split[col])

In [None]:
X_train.info(verbose=True, show_counts=True)

#### Quadratic

In [None]:
# Append the square of every original column to both splits
for col in orig_cols:
    squared_name = col + "_x2"
    for split in (X_train, X_test):
        split[squared_name] = split[col]**2

#### Reciprocal

In [None]:
# Append the reciprocal of every original column to both splits.
# Zeros turn into +/-inf; columns containing infinities are removed further down.
for col in orig_cols:
    reciprocal_name = col + "_1/x"
    for split in (X_train, X_test):
        split[reciprocal_name] = 1/split[col]

#### Exponential

In [None]:
# Append e**x of every original column to both splits
for col in orig_cols:
    exp_name = col + "_exp"
    for split in (X_train, X_test):
        split[exp_name] = np.exp(split[col])

#### Square root

In [None]:
# Append the square root of every original column to both splits.
# The "_sqrt" columns previously held np.exp(x), i.e. an exact duplicate of the
# "_exp" features; np.sqrt is the transform the column name promises.
# Negative inputs yield NaN; columns with nulls are dropped in the next step.
for col in orig_cols:

    new_col_name = col + "_sqrt"

    X_train[new_col_name] = np.sqrt(X_train[col])
    X_test[new_col_name] = np.sqrt(X_test[col])

In [None]:
X_train.info(verbose=True, show_counts=True)

#### Drop cols with null values

In [None]:
# Collect every column that contains a null in EITHER split. Looking at the
# training split alone could leave NaNs behind in X_test.
null_in_train = X_train.columns[X_train.isnull().any()]
null_in_test = X_test.columns[X_test.isnull().any()]
columns_with_null = null_in_train.union(null_in_test)

# Then, drop the same columns from both splits so their schemas stay identical
X_train.drop(columns=columns_with_null, inplace=True)
X_test.drop(columns=columns_with_null, inplace=True)


### Generate custom features (based on domain knowledge)

## Feature selection

In [None]:

# One binary target series per product, for the training split ...
(y_train_overdraft,
 y_train_cons_cred,
 y_train_mortgage,
 y_train_bk_cc) = (y_train[target] for target in ("overdraft_YES", "cons_cred_YES", "mortgage_YES", "bk_cc_YES"))

# ... and for the test split
(y_test_overdraft,
 y_test_cons_cred,
 y_test_mortgage,
 y_test_bk_cc) = (y_test[target] for target in ("overdraft_YES", "cons_cred_YES", "mortgage_YES", "bk_cc_YES"))

In [None]:
X_train.info()

In [None]:
X_train.info(verbose=True, show_counts=True)

In [None]:
def drop_columns_with_infinity(df):
    """Remove, in place, every column of `df` that contains +inf or -inf.

    Parameters:
        df (pandas DataFrame): Frame to clean; it is modified in place.

    Returns:
        None
    """
    # Per-column flag: does the column hold at least one infinite value?
    column_has_inf = df.isin([np.inf, -np.inf]).any()
    offending_columns = column_has_inf.index[column_has_inf]

    df.drop(columns=offending_columns, inplace=True)

# Drop every column that holds +/-inf in EITHER split. Cleaning each split on
# its own could remove different columns from X_train and X_test and leave the
# two frames with mismatched schemas for fitting / predicting.
inf_values = [np.inf, -np.inf]
cols_with_infinity = X_train.columns[X_train.isin(inf_values).any()].union(
    X_test.columns[X_test.isin(inf_values).any()]
)
X_train.drop(columns=cols_with_infinity, inplace=True)
X_test.drop(columns=cols_with_infinity, inplace=True)

In [None]:
X_train.shape

In [None]:
X_test.shape

In [None]:
num_features = num_cols
cat_features = [col for col in X_train.columns if col not in num_features]

### Select numerical features (ANOVA)

##### Backup

In [None]:
# Real snapshots: plain assignment would only create a second name for the same
# frames, so later in-place changes to X_train / X_test would also hit the "backup".
X_train_backup = X_train.copy()
X_test_backup = X_test.copy()

In [None]:
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif, chi2

pd.options.display.float_format = '{:,.5f}'.format

From the chat

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_selection import f_classif

def anova_feature_selection(X, y, significance_level=0.05):
    """
    Perform feature selection using ANOVA and visualize p-values and feature selection status.

    Parameters:
        X (pandas DataFrame): The input feature matrix.
        y (pandas Series or array-like): The target variable.
        significance_level (float): The significance level (default=0.05).

    Returns:
        pandas DataFrame: One row per feature with the columns 'Feature',
        'p-value' and 'Selected' (True when p-value < significance_level),
        sorted by ascending p-value.
    """
    # Perform ANOVA (only the p-values are used)
    _, p_values = f_classif(X, y)

    # Create a DataFrame to store the results
    results_df = pd.DataFrame({'Feature': X.columns,
                               'p-value': p_values,
                               'Selected': p_values < significance_level})

    # Sort the DataFrame based on p-values
    results_df.sort_values(by='p-value', ascending=True, inplace=True)

    # A p-value can underflow to exactly 0 for very strong features, which would
    # give an infinite bar height. Clip for plotting only; the returned p-values
    # are left untouched.
    plotted_p = results_df['p-value'].clip(lower=np.finfo(float).tiny)
    bar_colors = results_df['Selected'].map({True: 'g', False: 'r'})

    # Visualize the p-values and feature selection status
    plt.figure(figsize=(12, 6))
    plt.bar(results_df['Feature'], -np.log10(plotted_p), color=bar_colors)
    plt.axhline(-np.log10(significance_level), color='b', linestyle='--', label=f'Significance Level ({significance_level})')
    plt.xticks(rotation=90)
    plt.ylabel('-log(p-value)')
    plt.title('ANOVA Feature Selection')
    plt.legend()
    plt.tight_layout()
    plt.show()

    return results_df



import pandas as pd
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.feature_selection import f_classif

# NOTE: a later cell defines a function with this same name (the variant that
# also shortens the feature labels); when the cells run in order, that later
# definition replaces this one.
def anova_feature_selection_with_plotly(X, y, significance_level=0.05):
    """
    Perform feature selection using ANOVA and visualize p-values and feature selection status using Plotly.

    Parameters:
        X (pandas DataFrame): The input feature matrix.
        y (pandas Series or array-like): The target variable.
        significance_level (float): The significance level (default=0.05).

    Returns:
        pandas DataFrame: A DataFrame containing p-values and feature selection status
        (columns 'Feature', 'p-value', 'Selected'), sorted by ascending p-value.
    """
    # Perform ANOVA
    f_values, p_values = f_classif(X, y)
    # A feature counts as selected when its p-value is below the threshold
    is_selected = p_values < significance_level

    # Create a DataFrame to store the results
    results_df = pd.DataFrame({'Feature': X.columns, 'p-value': p_values, 'Selected': is_selected})

    # Sort the DataFrame based on p-values
    results_df.sort_values(by='p-value', ascending=True, inplace=True)

    # Visualize the p-values and feature selection status using Plotly
    # Bars: -log10(p-value) per feature, green when selected and red otherwise
    fig = make_subplots(specs=[[{"secondary_y": True}]])
    fig.add_trace(go.Bar(x=results_df['Feature'], y=-np.log10(results_df['p-value']),
                         marker_color=results_df['Selected'].map({True: 'green', False: 'red'}),
                         name='-log(p-value)'))

    # Dashed line at -log10(significance_level).
    # NOTE(review): the line is attached to the secondary y-axis while the bars
    # use the primary one - confirm both axes end up with the same range,
    # otherwise the line does not mark the cut-off among the bars.
    fig.add_trace(go.Scatter(x=results_df['Feature'], y=[-np.log10(significance_level)] * len(results_df),
                             mode='lines', line=dict(dash='dash', color='blue'),
                             name=f'Significance Level ({significance_level})'), secondary_y=True)

    fig.update_xaxes(tickangle=90)
    fig.update_yaxes(title='-log(p-value)', secondary_y=False)
    fig.update_yaxes(title='-log(p-value)', secondary_y=True, showgrid=False)
    fig.update_layout(title='ANOVA Feature Selection', legend=dict(x=0, y=1))

    fig.show()

    return results_df


In [None]:
X_train_num = X_train.loc[:, num_features]
X_train_cat = X_train.loc[:, cat_features]


In [None]:
X_train_cat_anova = anova_feature_selection(X=X_train_cat,
                       y=y_train_overdraft,
                       significance_level=0.05)

In [None]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.feature_selection import f_classif

def anova_feature_selection_with_plotly(X, y, significance_level=0.05):
    """
    Perform feature selection using ANOVA and visualize p-values and feature selection status using Plotly.

    Parameters:
        X (pandas DataFrame): The input feature matrix.
        y (pandas Series or array-like): The target variable.
        significance_level (float): The significance level (default=0.05).

    Returns:
        pandas DataFrame: One row per feature with the columns 'Feature',
        'p-value', 'Selected' (True when p-value < significance_level) and
        'Shortened_Feature' (the label used on the x-axis), sorted by
        ascending p-value.
    """
    # Perform ANOVA (only the p-values are used)
    _, p_values = f_classif(X, y)

    # Create a DataFrame to store the results
    results_df = pd.DataFrame({'Feature': X.columns,
                               'p-value': p_values,
                               'Selected': p_values < significance_level})

    # Sort the DataFrame based on p-values
    results_df.sort_values(by='p-value', ascending=True, inplace=True)

    # Shorten the feature names for better visualization
    max_label_length = 15  # Adjust the maximum label length as needed
    results_df['Shortened_Feature'] = results_df['Feature'].apply(
        lambda x: x[:max_label_length] + '...' if len(x) > max_label_length else x)

    # A p-value can underflow to exactly 0, which would give an infinite bar
    # height. Clip for plotting only; the returned p-values are left untouched.
    plotted_p = results_df['p-value'].clip(lower=np.finfo(float).tiny)

    # Bars and threshold line share ONE y-axis. On a separate secondary axis the
    # line is scaled independently and no longer marks the cut-off among the bars.
    fig = go.Figure()
    fig.add_trace(go.Bar(x=results_df['Shortened_Feature'], y=-np.log10(plotted_p),
                         marker_color=results_df['Selected'].map({True: 'green', False: 'red'}),
                         name='-log(p-value)'))

    fig.add_trace(go.Scatter(x=results_df['Shortened_Feature'],
                             y=[-np.log10(significance_level)] * len(results_df),
                             mode='lines', line=dict(dash='dash', color='blue'),
                             name=f'Significance Level ({significance_level})'))

    fig.update_xaxes(tickangle=45)  # Rotate the x-axis labels by 45 degrees
    fig.update_yaxes(title_text='-log(p-value)')
    fig.update_layout(title='ANOVA Feature Selection', legend=dict(x=0, y=1), height=600)  # Increase the height

    fig.show()

    return results_df


In [None]:
# X_train_cat_anova_plotly = anova_feature_selection_with_plotly(X=X_train_cat,
#                        y=y_train_overdraft,
#                        significance_level=0.05)

In [None]:
X_train_cat_anova

In [None]:
# Names of the features that passed the ANOVA significance test
selected_mask = X_train_cat_anova["Selected"].eq(True)
X_train_cat_sel_cols = X_train_cat_anova.loc[selected_mask, "Feature"]

In [None]:
list(X_train_cat_sel_cols)

### Select categorical features (Chi Squared)

In [None]:
# import pandas as pd
# import numpy as np
# import matplotlib.pyplot as plt
# from sklearn.feature_selection import chi2

# def chi_squared_feature_selection(X, y, significance_level=0.05):
#     """
#     Perform feature selection using chi-squared test and visualize p-values and feature selection status.

#     Parameters:
#         X (pandas DataFrame): The input feature matrix.
#         y (pandas Series or array-like): The target variable.
#         significance_level (float): The significance level (default=0.05).

#     Returns:
#         pandas DataFrame: A DataFrame containing p-values and feature selection status.
#     """
#     # Perform chi-squared test
#     chi2_values, p_values = chi2(X, y)
#     is_selected = p_values < significance_level

#     # Create a DataFrame to store the results
#     results_df = pd.DataFrame({'Feature': X.columns, 'p-value': p_values, 'Selected': is_selected})

#     # Sort the DataFrame based on p-values
#     results_df.sort_values(by='p-value', ascending=True, inplace=True)

#     # Visualize the p-values and feature selection status
#     plt.figure(figsize=(12, 6))
#     plt.bar(results_df['Feature'], -np.log10(results_df['p-value']), color=results_df['Selected'].map({True: 'g', False: 'r'}))
#     plt.axhline(-np.log10(significance_level), color='b', linestyle='--', label=f'Significance Level ({significance_level})')
#     plt.xticks(rotation=90)
#     plt.ylabel('-log(p-value)')
#     plt.title('Chi-Squared Feature Selection')
#     plt.legend()
#     plt.tight_layout()
#     plt.show()

#     return results_df


In [None]:
# X_train_chi_cols = chi_squared_feature_selection(X=X_train_num,
#                        y=y_train_overdraft,
#                        significance_level=0.05)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import pearsonr

def pearson_correlation_feature_selection(X, y, significance_level=0.05):
    """
    Rank features by their absolute Pearson correlation with the target and plot the ranking.

    Parameters:
        X (pandas DataFrame): The input feature matrix (numerical features only).
        y (pandas Series or array-like): The target variable (numerical).
        significance_level (float): The significance level for p-values (default=0.05).

    Returns:
        pandas DataFrame: One row per feature with the columns 'Feature',
        'Correlation Coefficient' (absolute value), 'p-value' and 'Selected'
        (True when p-value < significance_level), sorted by descending
        correlation.
    """
    # pearsonr yields the coefficient and its p-value together, so one pass over
    # the columns is enough (no separate full correlation matrix is needed)
    corr_coeffs, p_values = [], []
    for _, column in X.items():
        coeff, p_value = pearsonr(column, y)
        corr_coeffs.append(abs(coeff))
        p_values.append(p_value)

    # Create a DataFrame to store the results
    results_df = pd.DataFrame({'Feature': X.columns,
                               'Correlation Coefficient': corr_coeffs,
                               'p-value': p_values})
    results_df['Selected'] = results_df['p-value'] < significance_level

    # Sort the DataFrame based on correlation coefficients
    results_df.sort_values(by='Correlation Coefficient', ascending=False, inplace=True)

    # Visualize the correlations. Significance is a statement about the p-value,
    # so it is shown through the bar colour; a horizontal line at
    # `significance_level` on the correlation axis would compare unrelated scales.
    plt.figure(figsize=(12, 6))
    plt.bar(results_df['Feature'], results_df['Correlation Coefficient'],
            color=results_df['Selected'].map({True: 'g', False: 'r'}))
    legend_handles = [plt.Rectangle((0, 0), 1, 1, color='g'),
                      plt.Rectangle((0, 0), 1, 1, color='r')]
    plt.legend(legend_handles,
               [f'p-value < {significance_level}', f'p-value >= {significance_level}'])
    plt.xticks(rotation=90)
    plt.ylabel('Correlation Coefficient (absolute)')
    plt.title('Pearson Correlation Feature Selection')
    plt.tight_layout()
    plt.show()

    return results_df


In [None]:
X_train_pears_cols = pearson_correlation_feature_selection(X=X_train_num,
               y=y_train_overdraft,
                                                       significance_level=0.05)

In [None]:
X_train_pears_cols

In [None]:
# Statistically selected features: the ANOVA-selected ones first, followed by
# every numerical feature
all_st_fs = [*X_train_cat_sel_cols, *num_features]

In [None]:
all_st_fs

###  NOOO Combine num and cat selected features

In [None]:
features_selected = [col for col in X_train.columns if col in all_st_fs]

In [None]:
X_train_fs, X_test_fs = X_train[features_selected], X_test[features_selected]


## Balance the dependent variable of the training data

Multiple ways to do that:
- SMOTE
- **TODO** RandomOverSampler with correlation-aware sampling (ROS-CAS)


Will split y_train into four different y_trains for each category. The idea is that we'll have four separate models in the end that are going to make predictions for each class separately.

### SMOTE

In [None]:
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE
from collections import Counter


# One binary target series per product, for the training split ...
(y_train_overdraft,
 y_train_cons_cred,
 y_train_mortgage,
 y_train_bk_cc) = (y_train[target] for target in ("overdraft_YES", "cons_cred_YES", "mortgage_YES", "bk_cc_YES"))

# ... and for the test split
(y_test_overdraft,
 y_test_cons_cred,
 y_test_mortgage,
 y_test_bk_cc) = (y_test[target] for target in ("overdraft_YES", "cons_cred_YES", "mortgage_YES", "bk_cc_YES"))


In [None]:
y_train_overdraft.name

In [None]:
def oversample_with_SMOTE(train_X, train_y):
    """Balance one target with SMOTE and report the class counts before and after.

    Parameters:
        train_X (pandas DataFrame): Training predictors.
        train_y (pandas Series): Training target; its name becomes the column
            name of the returned target frame.

    Returns:
        tuple: (df_os_X_tr, df_os_y_tr) - the oversampled predictors and the
        oversampled target, both as DataFrames.
    """
    sampler = SMOTE(random_state=42)  # fixed seed for repeatable synthetic samples

    resampled_X, resampled_y = sampler.fit_resample(train_X, train_y)
    # TODO ...the rest to follow later

    balanced_X = pd.DataFrame(data=resampled_X, columns=train_X.columns)
    balanced_y = pd.DataFrame(data=resampled_y, columns=[train_y.name])

    # check old and new distributions:
    print("Original data target distributions:")
    print(train_y.value_counts())
    print()
    print()
    print("Oversampled data target distributions:")
    print(balanced_y.value_counts())

    return balanced_X, balanced_y

In [None]:
df_os_X_train_overdraft, df_os_y_train_overdraft = oversample_with_SMOTE(train_X=X_train, train_y=y_train_overdraft)

In [None]:
y_train_overdraft_ravel = df_os_y_train_overdraft.values.ravel()


In [None]:
X_train_fs_overdraft_OS, df_os_y_train_overdraft = oversample_with_SMOTE(train_X=X_train_fs, train_y=y_train_overdraft)

#### NOO Save training data

In [None]:
# df_os_X_train_overdraft.to_csv("df_os_X_train_overdraft_v1.csv")
# pd.Series(y_train_overdraft_ravel, name='overdraft').to_csv("y_train_overdraft_ravel_v1.csv")

# Modeling 

In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, GridSearchCV, RepeatedStratifiedKFold
from sklearn.metrics import make_scorer, recall_score, auc
from sklearn.model_selection import cross_validate, KFold


In [None]:
def model_cv_performance_measure(model, X, y, cv):
    """Print cross-validated recall for `model`, then its metrics when refit on all of X.

    Parameters:
        model: scikit-learn style estimator; it is fitted on (X, y) as a side effect.
        X: Predictor matrix.
        y: Binary target.
        cv: Cross-validation specification forwarded to `cross_val_score`.

    Returns:
        None - everything is printed.
    """
    # Recall of the positive class is the metric of interest
    recall_scorer = make_scorer(recall_score, average='binary')
    # scorer = make_scorer(auc)

    fold_recalls = cross_val_score(model, X, y, cv=cv, scoring=recall_scorer)

    print("Recall scores for each fold:", fold_recalls)
    # The worst fold is reported (not the average)
    print("Min recall:", fold_recalls.min())

    # Refit on the full data and score on that same data (in-sample performance)
    model.fit(X, y)
    print()
    in_sample_pred = model.predict(X)
    metric_lines = (
        ('Accuracy Score: ', accuracy_score),
        ('Confusion Matrix: \n', confusion_matrix),
        ('Area Under Curve: ', roc_auc_score),
        ('Recall score: ', recall_score),
    )
    for prefix, metric in metric_lines:
        print(f'{prefix}{metric(y, in_sample_pred)}')


In [None]:
def model_test_performance_measure(model, train_X, train_y, test_X, test_y, cv):
    """Fit `model` on the training data and print its metrics on the test data.

    Parameters:
        model: scikit-learn style estimator; it is fitted as a side effect.
        train_X, train_y: Training predictors and target.
        test_X, test_y: Held-out predictors and target.
        cv: Accepted for signature compatibility; not used by this function.

    Returns:
        None - everything is printed.
    """
    model.fit(train_X, train_y)
    print()
    test_pred = model.predict(test_X)
    metric_lines = (
        ('Accuracy Score: ', accuracy_score),
        ('Confusion Matrix: \n', confusion_matrix),
        ('Area Under Curve: ', roc_auc_score),
        ('Recall score: ', recall_score),
    )
    for prefix, metric in metric_lines:
        print(f'{prefix}{metric(test_y, test_pred)}')

In [None]:
def _print_split_statistics(title, y_true, y_pred):
    """Print accuracy, confusion matrix, AUC and recall for one data split."""
    print(title)
    print(f'Accuracy Score: {accuracy_score(y_true, y_pred)}')
    print(f'Confusion Matrix: \n{confusion_matrix(y_true, y_pred)}')
    print(f'Area Under Curve: {roc_auc_score(y_true, y_pred)}')
    print(f'Recall score: {recall_score(y_true, y_pred)}')


def test_model_cv_fit(model, train_X, train_y, test_X, test_y, cv, model_idx):
    """Cross-validate `model` and report train/test metrics of one fold's estimator.

    Cross-validation runs on the `train_X` / `train_y` that are passed in, using
    the `cv` argument. (It used to run on the module-level full-feature training
    set with a fixed cv=5, so the fold scores printed for feature-selected data
    actually described the full feature set.)

    Parameters:
        model: Unfitted scikit-learn style estimator; cross_validate works on
            clones, so `model` itself is left unfitted.
        train_X: Training predictors.
        train_y: Training target (binary).
        test_X: Held-out predictors with the same columns as `train_X`.
        test_y: Held-out target.
        cv: Cross-validation specification forwarded to `cross_validate`.
        model_idx (int): Index of the fold whose fitted estimator is evaluated.

    Returns:
        None - everything is printed.

    Raises:
        IndexError: If `model_idx` is not a valid fold index.
    """
    from sklearn.model_selection import cross_validate

    # Define the scoring metric you want to use (in this case, recall)
    scorer = make_scorer(recall_score, average='binary')

    # Perform cross-validation with the custom scorer
    cv_results = cross_validate(estimator=model,
                                X=train_X,
                                y=train_y,
                                cv=cv,
                                scoring=scorer,
                                return_estimator=True,
                                return_train_score=True)

    print(f"cv_results train folds: {cv_results['train_score']}")
    print(f"cv_results test folds: {cv_results['test_score']}")

    # The estimator returned for a fold is already fitted on exactly that fold's
    # training part of train_X, so no separate refit is required.
    fold_clf = cv_results["estimator"][model_idx]

    _print_split_statistics('Training statistics', train_y, fold_clf.predict(train_X))
    print()

    _print_split_statistics('Test statistics', test_y, fold_clf.predict(test_X))


In [None]:
# def model_predict(model, train_X, train_y, test_X, test_y):
    
#     from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, recall_score
    
#     # extract values
#     train_y_values = train_y.values.flatten() 
    
#     # fit it
#     model.fit(train_X, train_y_values)
    
#     # test
#     pred_y = model.predict(test_X)# performance
#     probas_y = model.predict_proba(test_X)[:, 1]
#     print(f'Accuracy Score: {accuracy_score(test_y, pred_y)}')
#     print(f'Confusion Matrix: \n{confusion_matrix(test_y, pred_y)}')
#     print(f'Area Under Curve: {roc_auc_score(test_y, pred_y)}')
#     print(f'Recall score: {recall_score(test_y, pred_y)}')
    
#     return pred_y, probas_y

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
lg1 = LogisticRegression(random_state=42, max_iter=1000, class_weight={0: 0.22, 1: 0.78})

for i in range(5):
    
    print(f"model_fold: {i}")
    test_model_cv_fit(model=lg1,
                     train_X=df_os_X_train_overdraft,
                     train_y=y_train_overdraft_ravel,
                     test_X=X_test,
                     test_y=y_test_overdraft,
                     cv=5,
                     model_idx=i)
    
    print()
    print()



### with Features Selected

In [None]:
from sklearn.linear_model import LogisticRegression
lg1 = LogisticRegression(random_state=42, max_iter=1000, class_weight={0: 0.22, 1: 0.78})

for i in range(5):
    
    print(f"model_fold: {i}")
    test_model_cv_fit(model=lg1,
                     train_X=X_train_fs_overdraft_OS,
                     train_y=y_train_overdraft_ravel,
                     test_X=X_test_fs,
                     test_y=y_test_overdraft,
                     cv=5,
                     model_idx=i)
    
    print()
    print()



In [None]:
# pred_y_overdraft, probas_y_overdraft = model_predict(model=lg1,
#                                 train_X=df_os_X_train_overdraft,
#                                 train_y=df_os_y_train_overdraft,
#                                 test_X=X_test,
#                                 test_y=y_test_overdraft)

In [None]:
y_train_overdraft.value_counts(normalize=True)

In [None]:
y_test_overdraft.value_counts(normalize=True)

## Random Forest

### Cross validation (extract the best model and fit/predict the data with it)

In [None]:
cl_1 = 0.5
rf_clf = RandomForestClassifier(n_estimators=100, 
                                class_weight={0: 1-cl_1, 1: cl_1},
                                max_depth=5, 
                                min_samples_leaf=100, 
                                max_leaf_nodes=20)
for i in range(5):
    
    print(f"model_fold: {i}")
    test_model_cv_fit(model=rf_clf,
                     train_X=df_os_X_train_overdraft,
                     train_y=y_train_overdraft_ravel,
                     test_X=X_test,
                     test_y=y_test_overdraft,
                     cv=5,
                     model_idx=i)
    
    print()
    print()


### with Feature Selection

In [None]:


cl_1 = 0.5
rf_clf = RandomForestClassifier(n_estimators=100, 
                                class_weight={0: 1-cl_1, 1: cl_1},
                                max_depth=5, 
                                min_samples_leaf=100, 
                                max_leaf_nodes=20)
for i in range(5):
    
    print(f"model_fold: {i}")
    test_model_cv_fit(model=rf_clf,
                     train_X=X_train_fs_overdraft_OS,
                     train_y=y_train_overdraft_ravel,
                     test_X=X_test_fs,
                     test_y=y_test_overdraft,
                     cv=5,
                     model_idx=i)
    
    print()
    print()


- n_estimators: 100 (default)
- max_depth: 5 (around 5) is ok
- min_samples_leaf: min 100, could be much more (500)
- max_leaf_nodes: around 20

TODO: compare cross-validation against a plain fit; finish the pipeline so that it produces all the models; give each model a hyperparameter grid (no more than 3-4 parameters with 3-4 values each) and run the grid search; decide how to set the class weights; download a Kaggle dataset and load it with this code - if the models still perform poorly, the modeling code is the problem, otherwise the generated data is the problem.

## Decision Tree

In [None]:
# define model
tree_clf = DecisionTreeClassifier()

for i in range(5):
    
    print(f"model_fold: {i}")
    test_model_cv_fit(model=tree_clf,
                     train_X=df_os_X_train_overdraft,
                     train_y=y_train_overdraft_ravel,
                     test_X=X_test,
                     test_y=y_test_overdraft,
                     cv=5,
                     model_idx=i)
    
    print()
    print()

### with Feature Selection

In [None]:
tree_clf = DecisionTreeClassifier()

for i in range(5):
    
    print(f"model_fold: {i}")
    test_model_cv_fit(model=tree_clf,
                     train_X=X_train_fs_overdraft_OS,
                     train_y=y_train_overdraft_ravel,
                     test_X=X_test_fs,
                     test_y=y_test_overdraft,
                     cv=5,
                     model_idx=i)
    
    print()
    print()

## XGBoost

In [None]:
# create model instance
xgboost_clf = XGBClassifier()


for i in range(5):

    print(f"model_fold: {i}")
    # Evaluate the XGBoost model itself (this call used to pass tree_clf, which
    # silently re-ran the decision tree under the XGBoost heading)
    test_model_cv_fit(model=xgboost_clf,
                     train_X=df_os_X_train_overdraft,
                     train_y=y_train_overdraft_ravel,
                     test_X=X_test,
                     test_y=y_test_overdraft,
                     cv=5,
                     model_idx=i)

    print()
    print()

### with Feature Selection

In [None]:
# create model instance
xgboost_clf = XGBClassifier()


for i in range(5):

    print(f"model_fold: {i}")
    # Evaluate the XGBoost model itself on the selected features (this call used
    # to pass tree_clf, which silently re-ran the decision tree)
    test_model_cv_fit(model=xgboost_clf,
                     train_X=X_train_fs_overdraft_OS,
                     train_y=y_train_overdraft_ravel,
                     test_X=X_test_fs,
                     test_y=y_test_overdraft,
                     cv=5,
                     model_idx=i)

    print()
    print()

# Hyperparameter tuning

In [None]:
# logistic_regression_search_space = {
#     'C': [0.001, 0.01, 0.1, 1, 10, 100],
#     'penalty': ['l1', 'l2'],
#     'solver': ['liblinear', 'lbfgs', 'saga']
# }

# decision_tree_search_space = {
#     'criterion': ['gini', 'entropy'],
#     'max_depth': [None, 5, 10, 20, 30],
#     'min_samples_split': [2, 5, 10, 20],
#     'min_samples_leaf': [1, 2, 4, 8]
# }

# # random forest
# #     n_estimators: 100 (default)
# #     max_depth: 5 (around 5) is ok
# #     min_samples_leaf: min 100, could be much more (500)
# #     max_leaf_nodes: around 20

# random_forest_search_space = {
#     'n_estimators': [50, 100, 200, 300],
#     'criterion': ['gini', 'entropy'],
#     'max_depth': [None, 5, 7, 8, 10],
#     'min_samples_leaf': [100, 200, 400, 500, 787],
#     'max_leaf_nodes': [10, 20, 50, 100, 200]
# }

# xgboost_search_space = {
#     'n_estimators': [50, 100, 500, 1000],
#     'max_depth': [13],
#     'min_child_weight': [1, 5, 10],
#     'subsample': [0.8, 0.9, 1.0],
#     'colsample_bytree': [0.8, 0.9, 1.0]
# }


In [None]:
# grid_results_dict = {}

# def grid_search(X, y, models_list, space_list, scoring, n_jobs, cv, save_dict):
    
#     for model, space in zip(models_list, space_list):
        
#         model_name = str(model)

#         search = GridSearchCV(model, space, scoring, n_jobs=n_jobs, cv=cv)
#         # execute search
#         result = search.fit(X, y)
#         # summarize result
#         print('Best Score: %s' % result.best_score_)
#         print('Best Hyperparameters: %s' % result.best_params_)

#         save_dict[model_name] = {'best_score_':result.best_score_,
#                                 'best_params_':result.best_params_}
        

In [None]:
# grid_search(X=df_os_X_train_overdraft,
#            y=y_train_overdraft_ravel,
#            models_list=[lg1, tree_clf, rf_clf, xgboost_clf],
#            space_list=[logistic_regression_search_space, decision_tree_search_space, random_forest_search_space, xgboost_search_space],
#            scoring='recall',
#            n_jobs=-1,
#            cv=5,
#            save_dict=grid_results_dict)

xgboost: 
- n_estimators: around 500
- max_depth: 13 (square root of num of features)
- subsample - ok

Best Score: 0.6737725686809087
Best Hyperparameters: {'C': 100, 'penalty': 'l2', 'solver': 'lbfgs'}
Best Score: 0.7464558296489499
Best Hyperparameters: {'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 2}
Best Score: 0.7891783938266397
Best Hyperparameters: {'criterion': 'gini', 'max_depth': None, 'max_leaf_nodes': 200, 'min_samples_leaf': 100, 'n_estimators': 300}

## Save training data

### Analyse feature importances

#### Feature importances

In [None]:
# cross_validate() fits clones of the estimator, so rf_clf itself has not been
# fitted by the cells above and has no feature_importances_ yet. Fit it on the
# oversampled full-feature training data before reading the importances.
rf_clf.fit(df_os_X_train_overdraft, y_train_overdraft_ravel)
rf_clf.feature_importances_[:10]

In [None]:
# rf_feat_importances = pd.Series(rf_clf.feature_importances_, index=df_os_X_train_overdraft.columns)
# rf_feat_importances.nlargest(10).plot(kind='barh')

In [None]:
# rf_best_feat_importances = list(rf_feat_importances.nlargest(10).index)
# rf_best_feat_importances

# Modeling after Hyperparam tuning

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression using C=100 / l2 / lbfgs, the combination reported by
# the grid-search output recorded earlier in this notebook.
lg1 = LogisticRegression(
    class_weight={0: 0.22, 1: 0.78},
    max_iter=1000,
    random_state=42,
    solver='lbfgs',
    penalty='l2',
    C=100,
)

# Run the evaluation helper five times, passing 0..4 as model_idx.
for i in range(5):
    print(f"model_fold: {i}")
    test_model_cv_fit(
        model=lg1,
        train_X=df_os_X_train_overdraft,
        train_y=y_train_overdraft_ravel,
        test_X=X_test,
        test_y=y_test_overdraft,
        cv=5,
        model_idx=i,
    )
    # Two empty lines separate consecutive fold reports.
    print("\n")



## Decision Trees

In [None]:
# Decision tree using the parameter combination reported by the grid-search
# output recorded earlier in this notebook.
tree_clf = DecisionTreeClassifier(
    min_samples_split=2,
    min_samples_leaf=2,
    max_depth=None,
    criterion='gini',
)

# Run the evaluation helper five times, passing 0..4 as model_idx.
for i in range(5):
    print(f"model_fold: {i}")
    test_model_cv_fit(
        model=tree_clf,
        train_X=df_os_X_train_overdraft,
        train_y=y_train_overdraft_ravel,
        test_X=X_test,
        test_y=y_test_overdraft,
        cv=5,
        model_idx=i,
    )
    # Two empty lines separate consecutive fold reports.
    print("\n")

## Random Forests

In [None]:
# Weight given to class 1; class 0 receives the complement.
cl_1 = 0.5

# Random forest using the parameter combination reported by the grid-search
# output recorded earlier in this notebook.
rf_clf = RandomForestClassifier(
    n_estimators=300,
    min_samples_leaf=100,
    max_leaf_nodes=200,
    max_depth=None,
    criterion='gini',
    class_weight={0: 1-cl_1, 1: cl_1},
)

# Run the evaluation helper five times, passing 0..4 as model_idx.
for i in range(5):
    print(f"model_fold: {i}")
    test_model_cv_fit(
        model=rf_clf,
        train_X=df_os_X_train_overdraft,
        train_y=y_train_overdraft_ravel,
        test_X=X_test,
        test_y=y_test_overdraft,
        cv=5,
        model_idx=i,
    )
    # Two empty lines separate consecutive fold reports.
    print("\n")


## xgBoost

In [None]:
# Gradient-boosted trees with the settings from the tuning notes
# (about 500 estimators, max_depth 13, row/column subsampling).
xgboost_clf = XGBClassifier(n_estimators=500,
                            max_depth=13,
                            subsample=0.8,
                            colsample_bytree=0.9)


# Run the evaluation helper five times, passing 0..4 as model_idx.
for i in range(5):

    print(f"model_fold: {i}")
    # Evaluate the XGBoost model built above; this section previously passed
    # the decision tree, so the booster was never scored.
    test_model_cv_fit(model=xgboost_clf,
                     train_X=df_os_X_train_overdraft,
                     train_y=y_train_overdraft_ravel,
                     test_X=X_test,
                     test_y=y_test_overdraft,
                     cv=5,
                     model_idx=i)

    print()
    print()

# Conclusion and Next steps

**1. Data Generation**
- We didn't generate our data based on the pers_exp column (which should've served as our base column for generating the rest of the numerical data)
- We didn't manage to generate the data using Cholesky - either we had good normal distributions with bad correlation matrices, or good correlation matrices with bad values (not complying with lower and upper boundaries, means, etc.)
- A question remains whether the business rules are sufficient to guarantee proper logical relations in the data.
- Apply business rules using an iterative approach (try different techniques to fill in the data)

**2. Feature Engineering**
- Wanted to try Genetic Algorithms for Feature Selection - didn't work out so far, but it's a TODO
- Binning!
- Check interaction effects between features (Dani)

**3. Modeling**
- run the models on another similar dataset 


# Run the models on another dataset

# Alternative tests

## Generate pers_exp

In [None]:
import numpy as np
from scipy.stats import truncnorm

# The given distribution and its corresponding bins.
# `distribution` is kept for reference; the sampling below does not read it.
distribution = [0.10, 0.44, 0.33, 0.14]
bins = [4500, 5000, 5500, 6000]

# Location and spread of the underlying normal distribution
# (std_dev can be adjusted to control the spread of the data).
mean = 5305.00
std_dev = 500

size_of_data = 1000

# truncnorm expects its limits in standard-deviation units relative to loc,
# so the outer bin edges are standardised first.
a = (bins[0] - mean) / std_dev
b = (bins[-1] - mean) / std_dev
truncated_data = truncnorm.rvs(a, b, loc=mean, scale=std_dev, size=size_of_data)

# Keep every value inside the outer bin edges.
pers_exp_data = np.clip(truncated_data, bins[0], bins[-1])

In [None]:

# Histogram (50 bins) of the synthetic pers_exp sample held in `pers_exp_data`.
# NOTE(review): only the synthetic sample is drawn here; no real-world series is plotted alongside it.
plt.hist(pers_exp_data, bins=50, alpha=0.5, label='Synthetic data')
plt.legend()
plt.show()


## Generate the rest of the numerical features

In [None]:
def to_uniform(data):
    """Map each observation to its empirical-CDF position in the open interval (0, 1).

    The i-th output is ``(rank_i - 0.5) / n`` where ``rank_i`` is the 1-based
    rank of ``data[i]`` within ``data``, so the output keeps the order of the
    input. Tied values receive consecutive ranks in order of appearance.

    Parameters:
        data (array-like): One-dimensional sample.

    Returns:
        np.ndarray: Float array of the same length as ``data``.
    """
    values = np.asarray(data)
    # argsort alone yields the indices that would sort the data; applying it
    # twice turns those indices into each element's 0-based rank.
    order = np.argsort(values, kind="stable")
    ranks = np.argsort(order, kind="stable") + 1
    # The half-step offset keeps every score strictly between 0 and 1.
    return (ranks - 0.5) / len(values)

def from_uniform(u, original_data):
    """Map scores in [0, 1] to values through the empirical quantiles of a sample.

    The sorted sample is placed on an evenly spaced grid over [0, 1] and the
    requested scores are linearly interpolated on that grid. Scores outside
    [0, 1] are clamped to the smallest / largest sample value by ``np.interp``.

    Parameters:
        u (float or array-like): Score(s) to convert.
        original_data (array-like): Sample whose distribution is reproduced.

    Returns:
        np.ndarray or float: Interpolated values with the shape of ``u``.
    """
    sorted_data = np.sort(np.asarray(original_data))
    quantile_grid = np.linspace(0, 1, len(sorted_data))
    return np.interp(u, quantile_grid, sorted_data)

# Rest of the code remains the same



In [None]:
import numpy as np
import pandas as pd
from scipy.stats import truncnorm

# Given correlation matrix
corr = {
    'features': ['age', 'ind_risk', 'income', 'pers_exp', 'house_exp', 'taxes', 'transp_telecom', 'hobby'],
    'age': [1, -0.00665947056405372, 0.00291644965339247, 0.0107779942638097, 0.00698674581731255, 0.00729153655132963, 0.0099866509330216, 0.00931630696561133],
    'ind_risk': [-0.00665947056405372, 1, 0.0039918072709289, 0.00806259039194059, 0.00457023635440603, 0.0061985340641631, 0.00768699810849585, -0.00332322616613201],
    'income': [0.00291644965339247, 0.0039918072709289, 1, 0.560949334881676, 0.58892666343229, 0.581907424628933, 0.562946509689962, 0.352350802339294],
    'pers_exp': [0.0107779942638097, 0.00806259039194059, 0.560949334881676, 1, 0.928449923861951, 0.929598634668897, 0.934775947642248, 0.714298364869941],
    'house_exp': [0.00698674581731255, 0.00457023635440603, 0.58892666343229, 0.928449923861951, 1, 0.93031279279417, 0.927846735467478, 0.679286362990223],
    'taxes': [0.00729153655132963, 0.0061985340641631, 0.581907424628933, 0.929598634668897, 0.93031279279417, 1, 0.92920510128812, 0.689442053350162],
    'transp_telecom': [0.0099866509330216, 0.00768699810849585, 0.562946509689962, 0.934775947642248, 0.927846735467478, 0.92920510128812, 1, 0.714114127908189],
    'hobby': [0.00931630696561133, -0.00332322616613201, 0.352350802339294, 0.714298364869941, 0.679286362990223, 0.689442053350162, 0.714114127908189, 1]
}

# DataFrame of means
means = pd.DataFrame([50.58, 0.6039, 13579.30, 5305.00, 2260, 1500, 1335, 1740],
                     columns=['mean'])
means.index = ['age', 'ind_risk', 'income', 'pers_exp', 'house_exp', 'taxes', 'transp_telecom', 'hobby']

# Define the given distribution and its corresponding bins for pers_exp
distribution = [0.10, 0.44, 0.33, 0.14]
bins = [4500, 5000, 5500, 6000]
mean = 5305.00
std_dev = 500
size_of_data = 1000
a, b = (bins[0] - mean) / std_dev, (bins[-1] - mean) / std_dev

# Generate random values following a truncated normal distribution
truncated_data = truncnorm.rvs(a, b, loc=mean, scale=std_dev, size=size_of_data)

# Clip the data to fit within the specified bins
pers_exp_data = np.clip(truncated_data, bins[0], bins[-1])

# Transform "pers_exp_data" to uniform scores with to_uniform
u_pers_exp = to_uniform(pers_exp_data)

# Generate random uncorrelated standard-normal data, one column per entry of `means`
random_data = np.random.normal(size=(size_of_data, len(means)))

# Use eigendecomposition to introduce the desired correlation matrix:
# the draws are multiplied by sqrt(diag(eigenvalues)) and the transposed eigenvectors.
corr_matrix = pd.DataFrame.from_dict(corr).drop('features', axis=1).values
eigenvalues, eigenvectors = np.linalg.eig(corr_matrix)
correlated_data = random_data @ np.sqrt(np.diag(eigenvalues)) @ eigenvectors.T

# Scale and shift the data to match the desired mean and standard deviation
# NOTE(review): `mean` and `std_dev` are the scalar pers_exp settings assigned earlier in this
# cell, so every column gets the same shift and scale; `means` is only used for the column
# count and labels - confirm this is intended.
scaled_data = correlated_data * std_dev + mean

# Apply inverse transformation to the original distribution for all features
# NOTE(review): from_uniform interpolates its first argument over a grid on [0, 1], but
# scaled_data is centred on `mean`; its second argument here is the uniform transform of
# pers_exp rather than a raw sample - confirm the intended mapping.
synthetic_data = pd.DataFrame(data=from_uniform(scaled_data, u_pers_exp),
                              columns=means.index)

print(synthetic_data.head())


In [None]:
import numpy as np
import pandas as pd
from scipy.stats import truncnorm

# Function to generate age data
def _sample_from_bins(bin_edges, size_of_data):
    """Pick a bin uniformly at random, then a uniform point inside that bin.

    Parameters:
        bin_edges (sequence of float): Increasing edges; consecutive pairs
            delimit the bins. The last edge may be ``np.inf``.
        size_of_data (int): Number of values to draw.

    Returns:
        np.ndarray: Float array of length ``size_of_data``.
    """
    edges = np.asarray(bin_edges, dtype=float)
    lower_edges = edges[:-1]
    widths = np.diff(edges)
    # An open-ended bin has no finite width to sample from, so it reuses the
    # width of the first bin.
    widths = np.where(np.isfinite(widths), widths, widths[0])
    picked = np.random.choice(len(lower_edges), size=size_of_data)
    # Each draw is spread over the width of its own bin, so a value can never
    # overshoot that bin's upper edge.
    return lower_edges[picked] + np.random.rand(size_of_data) * widths[picked]


# Function to generate age data
def generate_age_data(size_of_data):
    """Return ``size_of_data`` integer ages drawn uniformly from 20..85."""
    return np.random.randint(20, 86, size=size_of_data)


# Function to generate individual risk preference data
def generate_ind_risk_data(size_of_data):
    """Return ``size_of_data`` risk scores drawn uniformly from [0, 1)."""
    return np.random.rand(size_of_data)


# Function to generate income data
def generate_income_data(size_of_data):
    """Return ``size_of_data`` incomes sampled from the income bins."""
    income_bins = [0, 6121, 12001, 27601, 43201, 58801, 74401, np.inf]
    return _sample_from_bins(income_bins, size_of_data)


# Function to generate personal expenses data
def generate_pers_exp_data(size_of_data):
    """Return ``size_of_data`` personal-expense values within [0, 6000)."""
    pers_exp_bins = [0, 4500, 5000, 5500, 6000]
    return _sample_from_bins(pers_exp_bins, size_of_data)


# Function to generate housing costs data
def generate_house_exp_data(size_of_data):
    """Return ``size_of_data`` housing-cost values within [0, 4000)."""
    house_exp_bins = [0, 500, 1500, 3000, 4000]
    return _sample_from_bins(house_exp_bins, size_of_data)


# Function to generate taxes and insurance data
def generate_taxes_data(size_of_data):
    """Return ``size_of_data`` tax/insurance values within [0, 2500)."""
    taxes_bins = [0, 500, 1000, 2000, 2500]
    return _sample_from_bins(taxes_bins, size_of_data)


# Function to generate transport and communications data
def generate_transp_telecom_data(size_of_data):
    """Return ``size_of_data`` transport/telecom values within [0, 2500)."""
    transp_telecom_bins = [0, 500, 1000, 1500, 2500]
    return _sample_from_bins(transp_telecom_bins, size_of_data)


# Function to generate leisure and hobby data
def generate_hobby_data(size_of_data):
    """Return ``size_of_data`` leisure/hobby values within [0, 3000)."""
    hobby_bins = [0, 1500, 2000, 3000]
    return _sample_from_bins(hobby_bins, size_of_data)
import numpy as np
import pandas as pd
from scipy.stats import truncnorm

# ... (Same functions to generate data for each feature as provided in the previous response)
import numpy as np
import pandas as pd
from scipy.stats import truncnorm

# ... (Same functions to generate data for each feature as provided in the previous response)


In [None]:
import numpy as np
import pandas as pd
from scipy.stats import norm, truncnorm

# ... (Same functions to generate data for each feature as provided in the previous response)

# DataFrame of means
means = pd.DataFrame([50.58, 0.6039, 13579.30, 5305.00, 2260, 1500, 1335, 1740],
                     columns=['mean'])
means.index = ['age', 'ind_risk', 'income', 'pers_exp', 'house_exp', 'taxes', 'transp_telecom', 'hobby']

# Define the given distribution and its corresponding bins for pers_exp
distribution = [0.10, 0.44, 0.33, 0.14]
bins = [4500, 5000, 5500, 6000]
mean = 5305.00
std_dev = 500
size_of_data = 1000
a, b = (bins[0] - mean) / std_dev, (bins[-1] - mean) / std_dev

# Generate random values following a standard normal distribution
random_data = np.random.normal(size=(size_of_data, len(means)))

# Define the desired correlation matrix
corr_matrix = np.array([
    [1.0, 0.8, 0.3, 0.2, 0.1, 0.1, 0.05, 0.01],
    [0.8, 1.0, 0.5, 0.3, 0.2, 0.1, 0.05, 0.01],
    [0.3, 0.5, 1.0, 0.5, 0.3, 0.2, 0.1, 0.05],
    [0.2, 0.3, 0.5, 1.0, 0.8, 0.5, 0.3, 0.2],
    [0.1, 0.2, 0.3, 0.8, 1.0, 0.8, 0.5, 0.3],
    [0.1, 0.1, 0.2, 0.5, 0.8, 1.0, 0.8, 0.5],
    [0.05, 0.05, 0.1, 0.3, 0.5, 0.8, 1.0, 0.8],
    [0.01, 0.01, 0.05, 0.2, 0.3, 0.5, 0.8, 1.0]
])

# Use Cholesky decomposition to introduce the desired correlation matrix
chol_decomp = np.linalg.cholesky(corr_matrix)

# Generate correlated data using Cholesky decomposition
correlated_data = random_data @ chol_decomp.T

# from_uniform expects scores in [0, 1]; the correlated draws are standard
# normal, so they are first pushed through the normal CDF (Gaussian copula).
uniform_scores = norm.cdf(correlated_data)

# Draw one sample per feature to serve as the empirical marginal that the
# uniform scores are mapped onto.
age_data = generate_age_data(size_of_data)
ind_risk_data = generate_ind_risk_data(size_of_data)
income_data = generate_income_data(size_of_data)
pers_exp_data = generate_pers_exp_data(size_of_data)
house_exp_data = generate_house_exp_data(size_of_data)
taxes_data = generate_taxes_data(size_of_data)
transp_telecom_data = generate_transp_telecom_data(size_of_data)
hobby_data = generate_hobby_data(size_of_data)

# Apply inverse transformation to the original distribution for pers_exp
synthetic_pers_exp = from_uniform(uniform_scores[:, 3], pers_exp_data)

# Apply inverse transformation to the original distribution for other features.
# Column k of uniform_scores corresponds to row/column k of corr_matrix.
synthetic_data = pd.DataFrame()
synthetic_data['age'] = from_uniform(uniform_scores[:, 0], age_data)
synthetic_data['ind_risk'] = from_uniform(uniform_scores[:, 1], ind_risk_data)
synthetic_data['income'] = from_uniform(uniform_scores[:, 2], income_data)
synthetic_data['pers_exp'] = synthetic_pers_exp
synthetic_data['house_exp'] = from_uniform(uniform_scores[:, 4], house_exp_data)
synthetic_data['taxes'] = from_uniform(uniform_scores[:, 5], taxes_data)
synthetic_data['transp_telecom'] = from_uniform(uniform_scores[:, 6], transp_telecom_data)
synthetic_data['hobby'] = from_uniform(uniform_scores[:, 7], hobby_data)

# The marginal transforms are non-linear, so the Pearson correlations printed
# here approximate corr_matrix rather than reproduce it exactly.
print(synthetic_data.corr())


In [None]:
import numpy as np
import pandas as pd
from vinecopulib import VineCopula

# Define the number of samples
size_of_data = 1000

# Define the desired correlation matrix
corr_matrix = np.array([
    [1.0, 0.8, 0.3, 0.2, 0.1, 0.1, 0.05, 0.01],
    [0.8, 1.0, 0.5, 0.3, 0.2, 0.1, 0.05, 0.01],
    [0.3, 0.5, 1.0, 0.5, 0.3, 0.2, 0.1, 0.05],
    [0.2, 0.3, 0.5, 1.0, 0.8, 0.5, 0.3, 0.2],
    [0.1, 0.2, 0.3, 0.8, 1.0, 0.8, 0.5, 0.3],
    [0.1, 0.1, 0.2, 0.5, 0.8, 1.0, 0.8, 0.5],
    [0.05, 0.05, 0.1, 0.3, 0.5, 0.8, 1.0, 0.8],
    [0.01, 0.01, 0.05, 0.2, 0.3, 0.5, 0.8, 1.0]
])

# Define the means and standard deviations for each feature
means = pd.Series({
    'age': 52.0,
    'ind_risk': 0.5,
    'income': 45000.0,
    'pers_exp': 3500.0,
    'house_exp': 2500.0,
    'taxes': 1500.0,
    'transp_telecom': 1250.0,
    'hobby': 750.0
})

std_devs = pd.Series({
    'age': 10.0,
    'ind_risk': 0.2,
    'income': 15000.0,
    'pers_exp': 500.0,
    'house_exp': 500.0,
    'taxes': 200.0,
    'transp_telecom': 200.0,
    'hobby': 300.0
})

# Generate uniform data for VineCopula
uniform_data = np.random.rand(size_of_data, len(means))

# Start from an empty frame so the columns written below do not depend on a
# `synthetic_data` object left over from an earlier cell.
synthetic_data = pd.DataFrame()

# Apply inverse transformation to map the uniform data to the desired distributions.
# enumerate supplies the column position, so each feature reads its own column
# of uniform_data instead of whatever value `i` last held.
for i, column_name in enumerate(means.index):
    bins = np.linspace(means[column_name] - 3 * std_devs[column_name],
                       means[column_name] + 3 * std_devs[column_name], 100)
    # NOTE(review): uniform_data lies in [0, 1) while `bins` spans mean +/- 3 std;
    # pd.cut yields NaN for values outside the bins - confirm the intended rescaling.
    synthetic_data[column_name] = pd.cut(uniform_data[:, i], bins, labels=bins[:-1]).astype(float)

# Calculate the correlation matrix of the synthetic data
# (corr_matrix_synthetic is not read again in this cell)
corr_matrix_synthetic = synthetic_data.corr()

# Use VineCopula to capture the desired correlation structure
# NOTE(review): the copula is fitted to synthetic_data; the `corr_matrix` defined earlier in
# this cell is never passed to it. Also confirm that the installed package exposes
# `VineCopula` with `copula_order` / `family_set` arguments.
copula = VineCopula(copula_order=2, family_set=[0, 1, 2, 3])
copula.fit(synthetic_data)

# Generate correlated data using VineCopula
correlated_data = copula.sample(size_of_data)

# Scale and shift the data to match the desired mean and standard deviation
# (per-feature std_devs and means, broadcast across the columns)
scaled_data = correlated_data * std_devs.values + means.values

# Create the final synthetic dataset (replaces the frame the copula was fitted on)
synthetic_data = pd.DataFrame(data=scaled_data, columns=means.index)

# Print the correlation matrix of the synthetic data
print(synthetic_data.corr())


In [None]:
for col in synthetic_data.columns:
    plot_distribution(synthetic_data, col)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# ... (Code from the previous answer)

# Calculate the correlation matrix for the adjusted DataFrame
correlation_matrix_adjusted = synthetic_data.corr()


# Display the correlation matrix
print("Adjusted Correlation Matrix:")
print(correlation_matrix_adjusted)

# Convert the original correlation data (corr) to a DataFrame indexed by feature name
original_corr_df = pd.DataFrame(corr)
original_corr_df.set_index('features', inplace=True)

# Display the original correlation data
print("\nOriginal Correlation Matrix:")
print(original_corr_df)

# Plot the correlation matrix heatmaps for both adjusted and original data side by side
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
fig.suptitle("Comparison of Correlation Matrices", fontsize=16)

heatmap_panels = [
    (correlation_matrix_adjusted, "Adjusted Correlation Matrix"),
    (original_corr_df, "Original Correlation Matrix"),
]
for ax, (matrix, title) in zip(axes, heatmap_panels):
    # Correlations live in [-1, 1]; pinning the colour limits makes a given
    # colour stand for the same value in both panels.
    image = ax.imshow(matrix, cmap='coolwarm', interpolation='nearest', vmin=-1, vmax=1)
    tick_positions = np.arange(len(matrix))
    ax.set_xticks(tick_positions)
    ax.set_yticks(tick_positions)
    ax.set_xticklabels(matrix.columns, rotation=45)
    ax.set_yticklabels(matrix.columns)
    ax.set_title(title)

# One colour bar serves both panels because they share the same limits.
fig.colorbar(image, ax=list(axes), shrink=0.8)

plt.show()


In [None]:
corr = {
    'features': ['age', 'ind_risk', 'income', 'pers_exp', 'house_exp', 'taxes', 'transp_telecom', 'hobby'],
    'age': [1, -0.00665947056405372, 0.00291644965339247, 0.0107779942638097, 0.00698674581731255, 0.00729153655132963, 0.0099866509330216, 0.00931630696561133],
    'ind_risk': [-0.00665947056405372, 1, 0.0039918072709289, 0.00806259039194059, 0.00457023635440603, 0.0061985340641631, 0.00768699810849585, -0.00332322616613201],
    'income': [0.00291644965339247, 0.0039918072709289, 1, 0.560949334881676, 0.58892666343229, 0.581907424628933, 0.562946509689962, 0.352350802339294],
    'pers_exp': [0.0107779942638097, 0.00806259039194059, 0.560949334881676, 1, 0.928449923861951, 0.929598634668897, 0.934775947642248, 0.714298364869941],
    'house_exp': [0.00698674581731255, 0.00457023635440603, 0.58892666343229, 0.928449923861951, 1, 0.93031279279417, 0.927846735467478, 0.679286362990223],
    'taxes': [0.00729153655132963, 0.0061985340641631, 0.581907424628933, 0.929598634668897, 0.93031279279417, 1, 0.92920510128812, 0.689442053350162],
    'transp_telecom': [0.0099866509330216, 0.00768699810849585, 0.562946509689962, 0.934775947642248, 0.927846735467478, 0.92920510128812, 1, 0.714114127908189],
    'hobby': [0.00931630696561133, -0.00332322616613201, 0.352350802339294, 0.714298364869941, 0.679286362990223, 0.689442053350162, 0.714114127908189, 1]
}

# Collect the rows of the correlation matrix in the dict's own order,
# leaving out the entry that only holds the feature labels.
l_corr_matrix = [row for name, row in corr.items() if name != "features"]

arr_corr_matrix = np.array(l_corr_matrix)

mean = [50.58, 0.6039, 13579.30, 5305.00, 2260, 1500, 1335, 1740]
num_samples = 1000

In [None]:
def generate_correlated_random_variables(mean, covariance, num_samples):
    """Draw correlated normal samples via the Cholesky factor of ``covariance``.

    Parameters:
        mean: Value(s) added to every sample row (broadcast against the columns).
        covariance (np.ndarray): Square matrix passed to ``np.linalg.cholesky``.
        num_samples (int): Number of rows to generate.

    Returns:
        np.ndarray: Array with ``num_samples`` rows and one column per row of
        ``covariance``.
    """
    n_features = covariance.shape[0]
    lower_factor = np.linalg.cholesky(covariance)
    white_noise = np.random.normal(size=(num_samples, n_features))
    # Right-multiplying by the transposed factor imposes the covariance.
    coloured_noise = white_noise.dot(lower_factor.T)
    return mean + coloured_noise

In [None]:
# The generate_correlated_random_variables defined in the cell directly above
# accepts (mean, covariance, num_samples) only, so no pers_exp argument is passed.
samples = generate_correlated_random_variables(mean, arr_corr_matrix, num_samples)

In [None]:
import numpy as np

def generate_correlated_random_variables(mean, covariance, num_samples, pers_exp, pers_exp_idx=3):
    """Sample the remaining features conditionally on already-generated pers_exp values.

    The features are treated as jointly normal with the given ``mean`` and
    ``covariance``. The ``pers_exp`` column is kept exactly as supplied; every
    other column is drawn from the normal distribution conditional on it.

    Parameters:
        mean (array-like): Per-feature means, length ``k``.
        covariance (array-like): ``k x k`` covariance matrix. It must be on the
            same scale as ``pers_exp``; a correlation matrix is only
            appropriate when the data are standardised.
        num_samples (int): Number of rows; must equal ``len(pers_exp)``.
        pers_exp (array-like): Pre-generated pers_exp values, length ``num_samples``.
        pers_exp_idx (int): Position of pers_exp among the features. Defaults
            to 3, its position in the notebook's feature list.

    Returns:
        np.ndarray: Array of shape ``(num_samples, k)``.

    Raises:
        ValueError: If the shapes of the inputs are inconsistent.
    """
    mean = np.asarray(mean, dtype=float).ravel()
    covariance = np.asarray(covariance, dtype=float)
    pers_exp = np.asarray(pers_exp, dtype=float).ravel()

    n_features = mean.shape[0]
    if covariance.shape != (n_features, n_features):
        raise ValueError("covariance must be a square matrix matching the length of mean.")
    if not 0 <= pers_exp_idx < n_features:
        raise ValueError("pers_exp_idx is out of range for the given features.")
    if pers_exp.shape[0] != num_samples:
        raise ValueError("pers_exp must contain exactly num_samples values.")

    rest = np.delete(np.arange(n_features), pers_exp_idx)
    var_p = covariance[pers_exp_idx, pers_exp_idx]
    cov_rp = covariance[rest, pers_exp_idx]
    cov_rr = covariance[np.ix_(rest, rest)]

    # Conditional normal: mean shifts linearly with the pers_exp deviation,
    # and the covariance shrinks by the part explained by pers_exp.
    slope = cov_rp / var_p
    cond_mean = mean[rest] + np.outer(pers_exp - mean[pers_exp_idx], slope)
    cond_cov = cov_rr - np.outer(slope, cov_rp)

    samples = np.empty((num_samples, n_features))
    samples[:, pers_exp_idx] = pers_exp
    if rest.size:
        noise = np.random.normal(size=(num_samples, rest.size))
        samples[:, rest] = cond_mean + noise @ np.linalg.cholesky(cond_cov).T
    return samples


In [None]:
samples = generate_correlated_random_variables(mean, arr_corr_matrix, num_samples, pers_exp_data)

In [None]:
import plotly.express as px

def plot_distribution(data, column_name):
    """
    Draw a 50-bin Plotly histogram of one DataFrame column.

    Parameters:
        data (pd.DataFrame): Frame holding the column.
        column_name (str): Column whose values are plotted.

    Returns:
        None

    Raises:
        ValueError: If ``column_name`` is not a column of ``data``.
    """
    if column_name in data.columns:
        histogram = px.histogram(
            data,
            x=column_name,
            nbins=50,
            title=f'Distribution of {column_name}',
        )
        histogram.show()
        return

    # Reached only when the requested column is absent.
    raise ValueError(f"Column '{column_name}' not found in the DataFrame.")


In [None]:
# Plot the histogram of every column of synthetic_df.
# NOTE(review): `synthetic_df` is not assigned in the surrounding cells (they build
# `synthetic_data`) - confirm which frame is meant.
for col in synthetic_df.columns:
    plot_distribution(synthetic_df, col)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# ... (Code from the previous answer)

# Calculate the correlation matrix for the adjusted DataFrame
correlation_matrix_adjusted = synthetic_df.corr()


# Display the correlation matrix
print("Adjusted Correlation Matrix:")
print(correlation_matrix_adjusted)

# Convert the original correlation data (corr) to a DataFrame indexed by feature name
original_corr_df = pd.DataFrame(corr)
original_corr_df.set_index('features', inplace=True)

# Display the original correlation data
print("\nOriginal Correlation Matrix:")
print(original_corr_df)

# Plot the correlation matrix heatmaps for both adjusted and original data side by side
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
fig.suptitle("Comparison of Correlation Matrices", fontsize=16)

heatmap_panels = [
    (correlation_matrix_adjusted, "Adjusted Correlation Matrix"),
    (original_corr_df, "Original Correlation Matrix"),
]
for ax, (matrix, title) in zip(axes, heatmap_panels):
    # Correlations live in [-1, 1]; pinning the colour limits makes a given
    # colour stand for the same value in both panels.
    image = ax.imshow(matrix, cmap='coolwarm', interpolation='nearest', vmin=-1, vmax=1)
    tick_positions = np.arange(len(matrix))
    ax.set_xticks(tick_positions)
    ax.set_yticks(tick_positions)
    ax.set_xticklabels(matrix.columns, rotation=45)
    ax.set_yticklabels(matrix.columns)
    ax.set_title(title)

# One colour bar serves both panels because they share the same limits.
fig.colorbar(image, ax=list(axes), shrink=0.8)

plt.show()


In [None]:
def correlation_distance(matrix1, matrix2):
    """Return the Frobenius norm of ``matrix1 - matrix2``.

    Used to measure how far one correlation matrix is from another; 0 means
    the two matrices are identical.
    """
    difference = matrix1 - matrix2
    return np.linalg.norm(difference, ord='fro')


current_distance = correlation_distance(matrix1=correlation_matrix_adjusted, matrix2=original_corr_df)
current_distance

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Parameters of the normal distribution the pool is drawn from
mu, sigma = 5305.00, 700
# sigma = [0.10,0.44,0.33,0.14]

# Pool of 1000 normally distributed values
distribution = np.random.normal(loc=mu, scale=sigma, size=1000)

# Monte Carlo step: resample 100 values from the pool
synthetic_data_MC = np.random.choice(distribution, size=100)

# Histogram of the resampled values
plt.hist(synthetic_data_MC, label='Synthetic data', alpha=0.5, bins=50)
plt.legend()
plt.show()


# TODO: I have to bin it afterwards

In [None]:
import numpy as np
import matplotlib.pyplot as plt

def generate_pers_exp_data(size):
    """Draw ``size`` normal values and reduce them to bin-index based integers.

    NOTE(review): as written, the result is an integer array. ``np.digitize``
    minus 1 yields indices from -1 to 3, and the loop below writes ``i / 100``
    back into that integer array, which truncates to 0. The output therefore
    appears to contain only -1 (draws at or below 4500) and 0 - confirm the
    intended mapping before relying on this function.

    Parameters:
        size (int): Number of values to generate.

    Returns:
        np.ndarray: Integer array of length ``size``.
    """
    # Define the mean and standard deviation for the normal distribution
    mean = 5305.00
    std_dev = 600  # You can adjust this value to control the spread of the data

    # Generate random data following the normal distribution
    data = np.random.normal(loc=mean, scale=std_dev, size=size)

    # Map the data to the specified bins
    # `bins` is only used for its length (number of loop iterations below).
    bins = [0.10, 0.44, 0.33, 0.14]
    bin_edges = [4500, 5000, 5500, 6000]
    # With right=True a value x gets index i where bin_edges[i-1] < x <= bin_edges[i];
    # after subtracting 1 the indices run from -1 to 3.
    data = np.digitize(data, bin_edges, right=True) - 1

    # Adjust values to fit within each bin
    for i in range(len(bins)):
        idx = (data == i)
        # data[idx] holds the bin index i itself, not the original draw.
        data[idx] = np.clip(data[idx], 0, bin_edges[i] - 1) / 100.0

    return data

# Set the desired size of the synthetic data
size_of_data = 1000

# Generate the synthetic data
synthetic_data = generate_pers_exp_data(size_of_data)

# Plot the histogram to visualize the distribution
plt.hist(synthetic_data, bins=50, edgecolor='black')
plt.xlabel('pers_exp')
plt.ylabel('Frequency')
plt.title('Distribution of Synthetic pers_exp Data')
plt.grid(True)
plt.show()


In [None]:
import numpy as np
import pandas as pd
corr = {
    'features': ['age', 'ind_risk', 'income', 'pers_exp', 'house_exp', 'taxes', 'transp_telecom', 'hobby'],
    'age': [1, -0.00665947056405372, 0.00291644965339247, 0.0107779942638097, 0.00698674581731255, 0.00729153655132963, 0.0099866509330216, 0.00931630696561133],
    'ind_risk': [-0.00665947056405372, 1, 0.0039918072709289, 0.00806259039194059, 0.00457023635440603, 0.0061985340641631, 0.00768699810849585, -0.00332322616613201],
    'income': [0.00291644965339247, 0.0039918072709289, 1, 0.560949334881676, 0.58892666343229, 0.581907424628933, 0.562946509689962, 0.352350802339294],
    'pers_exp': [0.0107779942638097, 0.00806259039194059, 0.560949334881676, 1, 0.928449923861951, 0.929598634668897, 0.934775947642248, 0.714298364869941],
    'house_exp': [0.00698674581731255, 0.00457023635440603, 0.58892666343229, 0.928449923861951, 1, 0.93031279279417, 0.927846735467478, 0.679286362990223],
    'taxes': [0.00729153655132963, 0.0061985340641631, 0.581907424628933, 0.929598634668897, 0.93031279279417, 1, 0.92920510128812, 0.689442053350162],
    'transp_telecom': [0.0099866509330216, 0.00768699810849585, 0.562946509689962, 0.934775947642248, 0.927846735467478, 0.92920510128812, 1, 0.714114127908189],
    'hobby': [0.00931630696561133, -0.00332322616613201, 0.352350802339294, 0.714298364869941, 0.679286362990223, 0.689442053350162, 0.714114127908189, 1]
}

In [None]:
# Rows of the correlation matrix, taken from every entry of `corr` except
# the one that lists the feature labels.
l_corr_matrix = [corr[name] for name in corr if name != "features"]

arr_corr_matrix = np.array(l_corr_matrix)

In [None]:
def generate_correlated_random_variables(mean, covariance, num_samples):
    """Draw correlated normal samples via the Cholesky factor of ``covariance``.

    Parameters:
        mean: Per-feature means. Accepts a list, array, pandas Series or a
            single-column DataFrame; it is flattened to a 1-D vector.
        covariance (array-like): Square matrix passed to ``np.linalg.cholesky``.
        num_samples (int): Number of rows to generate.

    Returns:
        np.ndarray: Array with ``num_samples`` rows and one column per feature.
    """
    # Flatten pandas / nested inputs so the addition below broadcasts one mean
    # per column instead of failing on a pandas shape mismatch.
    mean_vector = np.asarray(mean, dtype=float).ravel()
    covariance = np.asarray(covariance, dtype=float)
    L = np.linalg.cholesky(covariance)

    # TO BE REPLACED BY DATA GENERATED FROM THE OLD FUNCTION THAT RESPECTS THE BOUNDARIES (generate_synthetic_data_with_bounds)
    normal_samples = np.random.normal(size=(num_samples, covariance.shape[0]))
    print("normal_samples shape", normal_samples.shape)
    return mean_vector + np.dot(normal_samples, L.T)

In [None]:
n_samples = 10000

# NOTE(review): `means` is a pandas object in the cells that define it, whereas `mean`
# (singular) is the plain list of feature means - confirm which one is intended here.
samples = generate_correlated_random_variables(mean=means,
                                               covariance=arr_corr_matrix,
                                               num_samples=n_samples)
# NOTE(review): `possible_values` is only assigned inside commented-out cells in this part
# of the notebook - confirm it exists before this line runs.
df_test = pd.DataFrame(samples, columns=possible_values.keys())

In [None]:
df_test.describe()

In [None]:
# import numpy as np
# from scipy.stats import norm
# from sklearn.preprocessing import MinMaxScaler


# def generate_synthetic_data_with_bounds(correlation_matrix, num_samples, feature_bounds):
#     num_features = correlation_matrix.shape[0]
#     lower_bounds, upper_bounds = zip(*feature_bounds)
#     scaler = MinMaxScaler(feature_range=(0, 1))
    
#     # Check if the correlation matrix is valid (symmetric and positive definite)
#     if not np.allclose(correlation_matrix, correlation_matrix.T):
#         raise ValueError("Correlation matrix must be symmetric.")
#     if not np.all(np.linalg.eigvals(correlation_matrix) > 0):
#         raise ValueError("Correlation matrix must be positive definite.")
    
#     # Generate synthetic data using multivariate normal distribution
#     mean = np.zeros(num_features)
#     synthetic_data = np.random.multivariate_normal(mean, correlation_matrix, num_samples)
    
#     # Apply Gaussian copula to maintain correlation structure
#     synthetic_data = norm.cdf(synthetic_data)
    
#     # Scale the data to the specified bounds for each feature
#     for i in range(num_features):
#         synthetic_data[:, i] = lower_bounds[i] + synthetic_data[:, i] * (upper_bounds[i] - lower_bounds[i])
    
#     return synthetic_data

In [None]:
# feature_bounds = []

# possible_values = {
#     'age': [20, 86],
#     'ind_risk': [0, 1],
#     'income': [0, 150000],
#     'pers_exp': [0, 6000],
#     'house_exp': [0, 4000],
#     'taxes': [0, 2500],
#     'transp_telecom': [0, 2500],
#     'hobby': [0, 3000],
# }


# for v in possible_values.values():
    
#     feature_bounds.append(tuple(v))

# # Outcome of (generate_synthetic_data_with_bounds) goes as synthetic_samples in the following function

# # Example usage:
# correlation_matrix = np.array(l_corr_matrix)

# num_samples = 10000

# synthetic_samp = generate_synthetic_data_with_bounds(correlation_matrix=correlation_matrix, 
#                                                      num_samples=num_samples, 
#                                                      feature_bounds=feature_bounds)

# mean = []

# for k, v in possible_values.items():
#     avg = sum(v)/len(v)
#     mean.append(avg)  

    
# print("mean", mean)

# def generate_correlated_random_variables_V2(mean, 
#                                             covariance, 
#                                             synthetic_samples):
    
    
#     L = np.linalg.cholesky(covariance)
#     print("normal_samples shape", synthetic_samples.shape)
#     return mean + np.dot(synthetic_samples, L.T)

In [None]:
# samples = generate_correlated_random_variables_V2(mean=mean, 
#                                             covariance=correlation_matrix, 
#                                             synthetic_samples=synthetic_samp)
# df_test = pd.DataFrame(samples, columns=possible_values.keys())

In [None]:
# Check the correlation of the generated variables
correlation = df_test.corr()
print("Correlation matrix:")
print(correlation)

In [None]:
# Visualize the distribution of the generated variables
sns.jointplot(x='age', y='income', data=df_test)
plt.show()

## Plot distributions

In [None]:
import plotly.express as px

def plot_distribution(data, column_name):
    """
    Show the distribution of a single DataFrame column as a Plotly histogram.

    Parameters:
        data (pd.DataFrame): Source frame.
        column_name (str): Name of the column to plot (50 bins).

    Returns:
        None

    Raises:
        ValueError: If the frame has no column called ``column_name``.
    """
    column_missing = column_name not in data.columns
    if column_missing:
        raise ValueError(f"Column '{column_name}' not found in the DataFrame.")

    chart_title = f'Distribution of {column_name}'
    px.histogram(data, x=column_name, nbins=50, title=chart_title).show()


In [None]:
for col in df_test.columns:
    plot_distribution(df_test, col)

## Plot the two correlation matrices

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# ... (Code from the previous answer)

# Calculate the correlation matrix for the adjusted DataFrame
correlation_matrix_adjusted = df_test.corr()


# Display the correlation matrix
print("Adjusted Correlation Matrix:")
print(correlation_matrix_adjusted)

# Convert the original correlation data (corr) to a DataFrame indexed by feature name
original_corr_df = pd.DataFrame(corr)
original_corr_df.set_index('features', inplace=True)

# Display the original correlation data
print("\nOriginal Correlation Matrix:")
print(original_corr_df)

# Plot the correlation matrix heatmaps for both adjusted and original data side by side
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
fig.suptitle("Comparison of Correlation Matrices", fontsize=16)

heatmap_panels = [
    (correlation_matrix_adjusted, "Adjusted Correlation Matrix"),
    (original_corr_df, "Original Correlation Matrix"),
]
for ax, (matrix, title) in zip(axes, heatmap_panels):
    # Correlations live in [-1, 1]; pinning the colour limits makes a given
    # colour stand for the same value in both panels.
    image = ax.imshow(matrix, cmap='coolwarm', interpolation='nearest', vmin=-1, vmax=1)
    tick_positions = np.arange(len(matrix))
    ax.set_xticks(tick_positions)
    ax.set_yticks(tick_positions)
    ax.set_xticklabels(matrix.columns, rotation=45)
    ax.set_yticklabels(matrix.columns)
    ax.set_title(title)

# One colour bar serves both panels because they share the same limits.
fig.colorbar(image, ax=list(axes), shrink=0.8)

plt.show()


In [None]:
def correlation_distance(matrix1, matrix2):
    """Measure the gap between two correlation matrices.

    Returns the Frobenius norm of the element-wise difference
    ``matrix1 - matrix2``.
    """
    delta = matrix1 - matrix2
    frobenius = np.linalg.norm(delta, 'fro')
    return frobenius


# Frobenius distance between the adjusted and the original correlation matrices
current_distance = correlation_distance(matrix1=correlation_matrix_adjusted, matrix2=original_corr_df)
# Bare expression at the end of the cell so the notebook displays the value
current_distance

## Run the distribution fitter to check the distributions of each feature

In [None]:
# Define the inputs and their probability distributions
# (each input is drawn from a normal distribution with the given mean / std_dev)
inputs = {"input_1": {"mean": 10, "std_dev": 2},
          "input_2": {"mean": 20, "std_dev": 3}}

# Number of iterations
num_iterations = 10000

# Run the simulation: draw all samples of each input in one vectorised call
# instead of looping in Python over num_iterations scalar draws.
input_1 = np.random.normal(inputs["input_1"]["mean"], inputs["input_1"]["std_dev"], size=num_iterations)
input_2 = np.random.normal(inputs["input_2"]["mean"], inputs["input_2"]["std_dev"], size=num_iterations)

# Calculate the output of the model for every simulated (input_1, input_2) pair
output = input_1**2 + input_1*input_2 + input_2**2

# Storage for simulation results: kept as a plain list of floats, one per iteration
results = output.tolist()

In [None]:

# Analyze the results
mean_result = np.mean(results)
std_dev_result = np.std(results)  # np.std default: population standard deviation (ddof=0)

# Visualize the results
sns.set_style('white')
sns.set_context("paper", font_scale = 2)

# Pass the samples once, as the x vector; `data=` is only needed when x names a
# column/key inside a dataset, so handing the same list to both was redundant.
sns.displot(x=results, kind="hist", bins = 100, aspect = 1.5)


# Print the results
print("Mean result: ", mean_result)
print("Standard deviation of result: ", std_dev_result)


In [None]:
# Fit the candidate distributions returned by get_distributions() to a column of
# df_test with Fitter and show the fit summary.
# NOTE(review): the `break` at the end of the body stops the loop after the first
# column, so only one column is ever fitted — presumably a deliberate shortcut
# while experimenting; confirm before relying on this for every feature.
for col in df_test.columns:
    
    print("col: ", col)
    # Fitter is given the column values as a plain Python list
    vals = list(df_test[col])
    
    f = Fitter(vals, distributions=get_distributions(),timeout=120, bins=50)#,xmin=0.1, xmax=1 ) 
    #get_common_distributions() ##['gamma','lognorm',"beta","burr","norm"]
    f.fit()

    f.summary()

    break

In [None]:
# Summary statistics of the "age" column of df_test
df_test["age"].describe()

In [None]:
# Summary statistics of the "ind_risk" column of df_test
df_test["ind_risk"].describe()

In [None]:
# feature_bounds = []

# possible_values = {
#     'age': [20, 86],
#     'ind_risk': [0, 1],
#     'income': [0, 150000],
#     'pers_exp': [0, 6000],
#     'house_exp': [0, 4000],
#     'taxes': [0, 2500],
#     'transp_telecom': [0, 2500],
#     'hobby': [0, 3000],
# }



# for v in possible_values.values():
    
#     feature_bounds.append(tuple(v))


# import numpy as np
# from scipy.stats import norm
# from sklearn.preprocessing import MinMaxScaler

# def generate_synthetic_data_with_bounds(correlation_matrix, num_samples, feature_bounds):
#     num_features = correlation_matrix.shape[0]
#     lower_bounds, upper_bounds = zip(*feature_bounds)
#     scaler = MinMaxScaler(feature_range=(0, 1))
    
#     # Check if the correlation matrix is valid (symmetric and positive definite)
#     if not np.allclose(correlation_matrix, correlation_matrix.T):
#         raise ValueError("Correlation matrix must be symmetric.")
#     if not np.all(np.linalg.eigvals(correlation_matrix) > 0):
#         raise ValueError("Correlation matrix must be positive definite.")
    
#     # Generate synthetic data using multivariate normal distribution
#     mean = np.zeros(num_features)
#     synthetic_data = np.random.multivariate_normal(mean, correlation_matrix, num_samples)
    
#     # Apply Gaussian copula to maintain correlation structure
#     synthetic_data = norm.cdf(synthetic_data)
    
#     # Scale the data to the specified bounds for each feature
#     for i in range(num_features):
#         synthetic_data[:, i] = lower_bounds[i] + synthetic_data[:, i] * (upper_bounds[i] - lower_bounds[i])
    
#     return synthetic_data

# # Example usage:
# correlation_matrix = np.array(l_corr_matrix)

# num_samples = 250000

# synthetic_data = generate_synthetic_data_with_bounds(correlation_matrix, num_samples, feature_bounds)
# print(synthetic_data)


In [None]:
# feature_bounds = []

# possible_values = {
#     'age': [20, 86],
#     'ind_risk': [0, 1],
#     'income': [0, 150000],
#     'pers_exp': [0, 6000],
#     'house_exp': [0, 4000],
#     'taxes': [0, 2500],
#     'transp_telecom': [0, 2500],
#     'hobby': [0, 3000],
# }



# for v in possible_values.values():
    
#     feature_bounds.append(tuple(v))


# import numpy as np
# from scipy.stats import norm
# from sklearn.preprocessing import MinMaxScaler

# def generate_synthetic_data_with_bounds(correlation_matrix, num_samples, feature_bounds):
#     num_features = correlation_matrix.shape[0]
#     lower_bounds, upper_bounds = zip(*feature_bounds)
#     scaler = MinMaxScaler(feature_range=(0, 1))
    
#     # Check if the correlation matrix is valid (symmetric and positive definite)
#     if not np.allclose(correlation_matrix, correlation_matrix.T):
#         raise ValueError("Correlation matrix must be symmetric.")
#     if not np.all(np.linalg.eigvals(correlation_matrix) > 0):
#         raise ValueError("Correlation matrix must be positive definite.")
    
#     # Generate synthetic data using multivariate normal distribution
#     mean = np.zeros(num_features)
#     synthetic_data = np.random.multivariate_normal(mean, correlation_matrix, num_samples)
    
#     # Apply Gaussian copula to maintain correlation structure
#     synthetic_data = norm.cdf(synthetic_data)
    
#     # Scale the data to the specified bounds for each feature
#     for i in range(num_features):
#         synthetic_data[:, i] = lower_bounds[i] + synthetic_data[:, i] * (upper_bounds[i] - lower_bounds[i])
    
#     return synthetic_data

# # Example usage:
# correlation_matrix = np.array(l_corr_matrix)

# num_samples = 250000

# synthetic_data = generate_synthetic_data_with_bounds(correlation_matrix, num_samples, feature_bounds)
# print(synthetic_data)


# Old tests

## TODO: Personal expenses

## Apply business rules

In [None]:
# Business-rule filter written as a string of Python code and executed with eval().
# NOTE(review): the string is a fixed literal (the f-prefix has no placeholders), so
# eval() is not required to build this mask; if rule strings ever come from outside
# the notebook, do not pass them to eval().
# test = f"[(df_merged['marit_stat'] =='Married') & (df_merged['house_memb'] >'2')]"
test = f"[(df_merged['age'] <25) & (df_merged['lv_educ'] !='Higher')]"

# The evaluated expression is a one-element list holding the boolean row mask
list_mask = eval(test)
# Show the rows of df_merged selected by that mask
df_merged[list_mask[0]]

In [None]:
# List the column labels of df_merged
df_merged.columns

In [None]:
# Rows where marit_stat is 'Married' and house_memb compares greater than '2'.
# NOTE(review): house_memb is compared with the string '2'; presumably the column
# holds strings, in which case the comparison is lexicographic — confirm the dtype.
df_merged.loc[(df_merged['marit_stat'] =='Married') & (df_merged['house_memb'] >'2')]

## Kolmogorov-Smirnov categorical tests

In [None]:
from scipy.stats import ks_2samp

# Category shares of the 'sex' feature: reference ("old") vs. newly generated ("new")
old = {'sex': {'labels': ['M', 'F'], 'values': [0.4854368932, 0.5145631068]}}
new = {'sex': {'labels': ['M', 'F'], 'values': [0.476, 0.524]}}

# Pull out the per-category values of both distributions
old_values = old['sex']['values']
new_values = new['sex']['values']

# Significance level the p-value is compared against
alpha = 0.05

# Two-sample Kolmogorov-Smirnov test on the two lists of values.
# NOTE(review): ks_2samp receives the two category shares of each side as if they
# were samples of observations — TODO confirm this is the intended check for
# categorical data.
ks_statistic, p_value = ks_2samp(old_values, new_values)

# Report whether the difference is significant at level alpha
message = (
    "The new distribution is significantly different from the old distribution."
    if p_value < alpha
    else "The new distribution is not significantly different from the old distribution."
)
print(message)


In [None]:
from scipy.stats import ks_2samp
import numpy as np

# Fixed seed so the random samples below are reproducible
np.random.seed(12345678)
x = np.random.normal(0, 1, 1000)
y = np.random.normal(0, 1, 1000)
z = np.random.normal(1.1, 0.9, 1000)

# Small hand-made samples: c repeats b, d holds much larger values than b
b = np.array([1,2,3,4,5])
c = np.array([1,2,3,4,5])
d = np.array([50, 100, 200, 400, 800])

# Significance level
a = 0.05

# Two-sample KS tests kept for later cells
res = ks_2samp(x, y)
res2 = ks_2samp(b, c)
res3 = ks_2samp(b, d)

# Print every comparison, reusing the stored results where they exist
for label, outcome in (
    ("x,y ks: ", res),
    ("x,z ks: ", ks_2samp(x, z)),
    ("b,c ks: ", res2),
    ("b,d ks: ", res3),
):
    print(label, outcome)

# Example outputs noted by the original author (not re-verified here):
# Ks_2sampResult(statistic=0.022999999999999909, pvalue=0.95189016804849647)
# Ks_2sampResult(statistic=0.41800000000000004, pvalue=3.7081494119242173e-77)

In [None]:
# True  -> the p-value is at or below the significance level `a`: the difference
#          between the two samples is statistically significant.
# False -> statistically insignificant: the two distributions are considered the same.
res3.pvalue <= a