Problem: According to a survey by Deloitte, 77% of professionals have experienced burnout at their current job. The same study found that 91% of respondents report that an overwhelming amount of stress or frustration has a detrimental effect on the quality of their work.
In this project, I will investigate the underlying workplace factors that contribute to the rate of employee burnout.
To answer this question, I will employ regression analysis and predictive modeling techniques on a dataset obtained from Kaggle that includes an extensive variety of variables.
The variables used in this analysis are described below, after the data is loaded.
# libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Import data
bo_data = pd.read_csv("C:/Users/ntybl/OneDrive/Documents/Data Science for ECON/BurnoutData.csv")
bo_data.head()
| | Employee ID | Date of Joining | Gender | Company Type | WFH Setup Available | Designation | Resource Allocation | Mental Fatigue Score | Burn Rate |
|---|---|---|---|---|---|---|---|---|---|
| 0 | fffe32003000360033003200 | 09/30/2008 | Female | Service | No | 2 | 3.0 | 3.8 | 0.16 |
| 1 | fffe3700360033003500 | 11/30/2008 | Male | Service | Yes | 1 | 2.0 | 5.0 | 0.36 |
| 2 | fffe31003300320037003900 | 03/10/2008 | Female | Product | Yes | 2 | NaN | 5.8 | 0.49 |
| 3 | fffe32003400380032003900 | 11/03/2008 | Male | Service | Yes | 1 | 1.0 | 2.6 | 0.20 |
| 4 | fffe31003900340031003600 | 07/24/2008 | Female | Service | No | 3 | 7.0 | 6.9 | 0.52 |
# Delete "Employee ID" since its not relevant for this analysis
bo_data.drop(columns=['Employee ID'], inplace=True)
# Convert 'Date of Joining' to datetime format (the dates appear as MM/DD/YYYY)
bo_data['Date of Joining'] = pd.to_datetime(bo_data['Date of Joining'], format='%m/%d/%Y')
# Summary statistics of the numerical variables
summary_stats = bo_data[['Designation', 'Resource Allocation', 'Mental Fatigue Score', 'Burn Rate']].describe()
print(summary_stats)
| | Designation | Resource Allocation | Mental Fatigue Score | Burn Rate |
|---|---|---|---|---|
| count | 22750.000000 | 21369.000000 | 20633.000000 | 21626.000000 |
| mean | 2.178725 | 4.481398 | 5.728188 | 0.452005 |
| std | 1.135145 | 2.047211 | 1.920839 | 0.198226 |
| min | 0.000000 | 1.000000 | 0.000000 | 0.000000 |
| 25% | 1.000000 | 3.000000 | 4.600000 | 0.310000 |
| 50% | 2.000000 | 4.000000 | 5.900000 | 0.450000 |
| 75% | 3.000000 | 6.000000 | 7.100000 | 0.590000 |
| max | 5.000000 | 10.000000 | 10.000000 | 1.000000 |
Designation: Rank of an employee within the organization (0 to 5, higher means more senior)
Resource Allocation: Number of hours worked per day (1 to 10)
Mental Fatigue Score: Level of mental fatigue the employee is facing (0 to 10)
Burn Rate: The outcome variable; the employee's level of burnout (0 to 1)
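The scales described above can be confirmed directly from the data; a quick, optional sanity check:
# Verify the ranges: 'Mental Fatigue Score' should span 0-10 and 'Burn Rate' 0-1
print(bo_data[['Mental Fatigue Score', 'Burn Rate']].agg(['min', 'max']))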
# Histogram of 'Mental Fatigue Score' and 'Burn Rate'
plot_df = bo_data[['Mental Fatigue Score', 'Burn Rate']]
# Plot histograms
plot_df.hist(bins=30, figsize=(10, 6), color='blue', edgecolor='black')
plt.show()
[Histograms: Mental Fatigue Score and Burn Rate]
Overall, both histograms exhibit a roughly bell-shaped distribution with slight deviations from symmetry.
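That visual impression can be quantified; pandas' skew() (which skips the missing values) gives a quick numeric check of the asymmetry:
# Skewness of the two plotted variables (values near 0 indicate symmetry)
print(plot_df.skew())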
# Scatter plot of 'Mental Fatigue Score' against 'Burn Rate'
plt.figure(figsize=(8, 6))
plt.scatter(bo_data['Mental Fatigue Score'], bo_data['Burn Rate'], color='blue', alpha=0.5)
plt.title('Scatter Plot of Mental Fatigue Score vs. Burn Rate')
plt.xlabel('Mental Fatigue Score')
plt.ylabel('Burn Rate')
plt.grid(True)
plt.show()
# Line plot of 'Mental Fatigue Score' against 'Date of Joining'
line_plt = bo_data[['Date of Joining', 'Mental Fatigue Score']].copy()  # .copy() avoids a SettingWithCopyWarning below
# Extract month and year from 'Date of Joining'
line_plt['Joining Month'] = line_plt['Date of Joining'].dt.to_period('M')
# Group by 'Joining Month' and calculate average 'Mental Fatigue Score' for each month
monthly_avg_score = line_plt.groupby('Joining Month')['Mental Fatigue Score'].mean()
# Plot line plot
plt.figure(figsize=(10, 6))
plt.plot(monthly_avg_score.index.to_timestamp(), monthly_avg_score.values, color='blue')
plt.title('Average Mental Fatigue Score by Joining Month')
plt.xlabel('Joining Month')
plt.ylabel('Average Mental Fatigue Score')
plt.grid(True)
plt.xticks(rotation=45) # Rotate x-axis labels for better readability
plt.show()
(Without the .copy() above, pandas emits a SettingWithCopyWarning on the 'Joining Month' assignment, because the new column would be set on a slice of the original DataFrame; see https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy.)
# Bar chart of 'Burn Rate' against 'Designation'
burn_designation = bo_data.groupby('Designation')['Burn Rate'].mean()
# Plot bar chart
plt.figure(figsize=(8, 6))
plt.bar(burn_designation.index, burn_designation.values, color='blue')
plt.title('Average Burn Rate by Designation Level')
plt.xlabel('Designation Level')
plt.ylabel('Average Burn Rate')
plt.xticks(rotation=45)
plt.grid(axis='y')
plt.show()
# Bar chart of 'Burn Rate' against 'Gender'
burn_gender = bo_data.groupby('Gender')['Burn Rate'].mean()
# Plot bar chart
plt.figure(figsize=(6, 6))
plt.bar(burn_gender.index, burn_gender.values, color='green')
plt.title('Average Burn Rate by Gender')
plt.xlabel('Gender')
plt.ylabel('Average Burn Rate')
plt.grid(axis='y')
plt.show()
# Bar chart of 'Burn Rate' against 'WFH Setup Available'
burn_WFH = bo_data.groupby('WFH Setup Available')['Burn Rate'].mean()
# Plot bar chart
plt.figure(figsize=(6, 6))
plt.bar(burn_WFH.index, burn_WFH.values, color='pink')
plt.title('Average Burn Rate by WFH Setup Availability')
plt.xlabel('WFH Setup Available')
plt.ylabel('Average Burn Rate')
plt.grid(axis='y')
plt.show()
# Correlation heatmap of the numerical variables (numeric_only skips the string columns)
dataplot = sns.heatmap(bo_data.corr(numeric_only=True), cmap="YlGnBu", annot=True)
plt.show()
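One relationship in the heatmap is worth checking directly, since, as noted in the conclusion, a highly correlated variable (presumably 'Mental Fatigue Score', the only numeric variable later excluded from the predictors) was dropped:
# Pairwise correlation between the strongest candidate pair
print(bo_data['Mental Fatigue Score'].corr(bo_data['Burn Rate']))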
# Check for null values
null_percentage = (bo_data.isnull().sum() / len(bo_data)) * 100
print("Null Percentage by Column:")
print(null_percentage)
Null Percentage by Column:
Date of Joining         0.000000
Gender                  0.000000
Company Type            0.000000
WFH Setup Available     0.000000
Designation             0.000000
Resource Allocation     6.070330
Mental Fatigue Score    9.305495
Burn Rate               4.940659
dtype: float64
I will handle the missing values during the feature engineering phase of this project.
Feature Engineering
from scipy.stats import skew
from sklearn.impute import KNNImputer
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer
# Convert categorical variables into binary variables
bo_data.replace({'Gender': {'Male': 1, 'Female': 0},
                 'Company Type': {'Service': 1, 'Product': 0},
                 'WFH Setup Available': {'Yes': 1, 'No': 0}}, inplace=True)
# Split data into predictors and classes (outcome variable).
predictors = bo_data[['Gender', 'Company Type', 'WFH Setup Available', 'Designation', 'Resource Allocation']]
classes = bo_data[['Burn Rate']]
# look at the shapes
print(predictors.shape)
print(classes.shape)
(22750, 5)
(22750, 1)
# Use k-nearest neighbors to impute missing values
# Initialize KNN imputer
imputer = KNNImputer(n_neighbors=3) # You can adjust the number of neighbors as needed
# Impute missing values
predictors_imputed = imputer.fit_transform(predictors)
# Convert the array back to a DataFrame
predictors_imputed = pd.DataFrame(predictors_imputed, columns=predictors.columns)
# Impute missing values of classes using mean
classes_imputed = classes.fillna(classes.mean())
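A quick sanity check, assuming the cells above ran as shown, that imputation left no missing values in either the predictors or the outcome:
# Both counts should be 0 after imputation
print(predictors_imputed.isnull().sum().sum())
print(classes_imputed.isnull().sum().sum())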
# Check for skewness
columns_to_check = ['Designation', 'Resource Allocation']
# Calculate skewness for each column
for col in columns_to_check:
    skewness = skew(predictors_imputed[col])
    print(f"Skewness of {col}: {skewness}")
Skewness of Designation: 0.09241529095974846
Skewness of Resource Allocation: 0.20676562406952995
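Both skewness values are close to zero, so the variables are roughly symmetric. The conclusion mentions that a Box-Cox transformation was tried and discarded; a hypothetical sketch of that before/after comparison, using the PowerTransformer imported above (note that Box-Cox requires strictly positive inputs, which holds for 'Resource Allocation' but not for 'Designation', whose minimum is 0):
# Box-Cox transform of 'Resource Allocation' and skewness before vs. after
pt = PowerTransformer(method='box-cox')
ra_transformed = pt.fit_transform(predictors_imputed[['Resource Allocation']])
print("Skewness before:", skew(predictors_imputed['Resource Allocation']))
print("Skewness after:", skew(ra_transformed.ravel()))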
# I will now apply PCA to my predictors
# Standardize the predictors
scaler = StandardScaler()
scaled_predictors = scaler.fit_transform(predictors_imputed)
# Apply PCA
pca = PCA()
pca.fit(scaled_predictors)
# Transform the predictors into principal components
predictor_pca = pca.transform(scaled_predictors)
print('Variance explained by PCs {0}'.format(pca.explained_variance_ratio_))
Variance explained by PCs [0.41039126 0.20066018 0.19228763 0.17384157 0.02281937]
# Plot explained variance ratio
plt.figure(figsize=(8, 5))
plt.bar(range(1, len(pca.explained_variance_ratio_) + 1), pca.explained_variance_ratio_, color='skyblue')
plt.xlabel('Principal Component')
plt.ylabel('Explained Variance Ratio')
plt.title('Explained Variance Ratio of Principal Components')
plt.show()
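The cumulative explained variance makes the trade-off explicit: four of the five components are needed to capture roughly 98% of the variance, so PCA offers little dimensionality reduction here, and the models below are fit on the original imputed predictors rather than the principal components.
# Cumulative explained variance by number of components
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
for i, cv in enumerate(cumulative_variance, start=1):
    print(f"PC1-PC{i}: {cv:.4f}")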
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Lasso
import statsmodels.api as sm
# Split the data into train and test
X_train, X_test, y_train, y_test = train_test_split(predictors_imputed, classes_imputed, test_size=0.2, random_state=42)
# Create and fit linear regression
X_train_sm = sm.add_constant(X_train) # Add constant term
model_sm = sm.OLS(y_train, X_train_sm)
results_sm = model_sm.fit()
# Make predictions on testing set
X_test_sm = sm.add_constant(X_test) # Add constant term
y_pred = results_sm.predict(X_test_sm)
# Calculate MSE
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error (Linear):", mse)
# Calculate AIC
aic = results_sm.aic
print("AIC (Linear):", aic)
Mean Squared Error (Linear): 0.011598094478036369
AIC (Linear): -29607.080420509024
# Plot actual vs predicted values
plt.scatter(y_test, y_pred, color='blue', alpha=0.5)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], color='red', linestyle='--')
plt.xlabel('Actual Burnout Rate')
plt.ylabel('Predicted Burnout Rate')
plt.title('Actual vs Predicted Burnout Rate (Linear Regression)')
plt.show()
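Beyond MSE and AIC, the statsmodels results object can report the fitted coefficients and their significance, which helps interpret which predictors drive the burn rate:
# Coefficient estimates, standard errors, and p-values for the OLS fit
print(results_sm.summary())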
# Flatten the outcome to 1-D arrays, as expected by the scikit-learn estimators below
y_train = np.ravel(y_train)
y_test = np.ravel(y_test)
# Fit the Lasso regression model
lasso_model = Lasso(alpha=0.1)
lasso_model.fit(X_train, y_train)
# Make predictions on the test set
y_pred = lasso_model.predict(X_test)
# Calculate MSE
mse_l = mean_squared_error(y_test, y_pred)
print("Mean Squared Error (Lasso):", mse_l)
# Calculate RSS (used for the AIC below)
rss = np.sum((y_pred - y_test) ** 2)
# Number of features
n_predictors = X_train.shape[1]
# Calculate the number of observations
n_obs = len(y_test)
# Calculate the degrees of freedom
df = n_obs - n_predictors - 1
# Calculate the AIC
aic = n_obs * np.log(rss / n_obs) + 2 * n_predictors
print("AIC (Lasso):", aic)
Mean Squared Error (Lasso): 0.013770525810345529
AIC (Lasso): -19487.772757009512
# Fit the Lasso regression model with a lower alpha value
lasso_model2 = Lasso(alpha=0.001)
lasso_model2.fit(X_train, y_train)
# Make predictions on the test set
y_pred = lasso_model2.predict(X_test)
# MSE
mse_l2 = mean_squared_error(y_test, y_pred)
print("Mean Squared Error (Lasso2):", mse_l2)
# RSS
rss = np.sum((y_pred - y_test) ** 2)
# Number of features
n_predictors = X_train.shape[1]
# Observations
n_obs = len(y_test)
# Degrees of freedom
df = n_obs - n_predictors - 1
# AIC
aic = n_obs * np.log(rss / n_obs) + 2 * n_predictors
print("AIC (Lasso2):", aic)
Mean Squared Error (Lasso2): 0.011606493044358195
AIC (Lasso2): -20265.667194807742
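The MSE/AIC computation above is repeated verbatim for each Lasso fit; a small helper (a sketch, using the same AIC formula as above) would avoid the duplication:
# Helper: evaluate a fitted model on the test set and report MSE and AIC
def evaluate_model(model, X_test, y_test, label):
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    rss = np.sum((y_pred - y_test) ** 2)
    n_obs = len(y_test)
    aic = n_obs * np.log(rss / n_obs) + 2 * X_test.shape[1]
    print(f"Mean Squared Error ({label}):", mse)
    print(f"AIC ({label}):", aic)
    return mse, aic
# Example usage: evaluate_model(lasso_model2, X_test, y_test, "Lasso2")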
# KNN model
knn_model = KNeighborsRegressor(n_neighbors=5)
# Fit the model
knn_model.fit(X_train, y_train)
# Make predictions
y_pred = knn_model.predict(X_test)
# Evaluate (MSE)
mse_knn = mean_squared_error(y_test, y_pred)
print("Mean Squared Error (KNN):", mse_knn)
Mean Squared Error (KNN): 0.013328017433556338
# KNN model with 3 neighbors
knn_model2 = KNeighborsRegressor(n_neighbors=3)
# Fit the model
knn_model2.fit(X_train, y_train)
# Make predictions
y_pred = knn_model2.predict(X_test)
# Evaluate (MSE)
mse_knn2 = mean_squared_error(y_test, y_pred)
print("Mean Squared Error (KNN2):", mse_knn2)
Mean Squared Error (KNN2): 0.014377058149953001
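Rather than trying individual values of k, a quick sweep shows how the error varies with the number of neighbors (ideally this would use a validation set or cross-validation rather than the test set):
# Sweep several neighbor counts and report the test MSE for each
for k in [3, 5, 10, 15, 20]:
    knn = KNeighborsRegressor(n_neighbors=k)
    knn.fit(X_train, y_train)
    print(f"k={k}: MSE={mean_squared_error(y_test, knn.predict(X_test)):.6f}")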
# Initialize the Random Forest
rf_model = RandomForestRegressor(n_estimators=100, random_state=44)
# Fit the model on train
rf_model.fit(X_train, y_train)
# Make predictions on test
y_pred_rf = rf_model.predict(X_test)
# Calculate MSE
mse_rf = mean_squared_error(y_test, y_pred_rf)
print("Mean Squared Error (Random Forest):", mse_rf)
Mean Squared Error (Random Forest): 0.01117968504080362
# Random Forest with higher n_estimators
# Initialize the Random Forest
rf_model2 = RandomForestRegressor(n_estimators=300, random_state=44)
# Fit the model on train
rf_model2.fit(X_train, y_train)
# Make predictions on test
y_pred_rf = rf_model2.predict(X_test)
# Calculate MSE
mse_rf2 = mean_squared_error(y_test, y_pred_rf)
print("Mean Squared Error (Random Forest2):", mse_rf2)
Mean Squared Error (Random Forest2): 0.011178057893738682
from sklearn.model_selection import GridSearchCV
# Model Selection
print("Mean Squared Error (Linear):", mse)
print("Mean Squared Error (Lasso):", mse_l)
print("Mean Squared Error (Lasso2):", mse_l2)
print("Mean Squared Error (KNN):", mse_knn)
print("Mean Squared Error (KNN2):", mse_knn2)
print("Mean Squared Error (Random Forest):", mse_rf)
print("Mean Squared Error (Random Forest2):", mse_rf2)
Mean Squared Error (Linear): 0.013770525810345529
Mean Squared Error (Lasso): 0.013770525810345529
Mean Squared Error (Lasso2): 0.011606493044358195
Mean Squared Error (KNN): 0.013328017433556338
Mean Squared Error (KNN2): 0.014377058149953001
Mean Squared Error (Random Forest): 0.01117968504080362
Mean Squared Error (Random Forest2): 0.011178057893738682
# Random Forest
# Define parameters
param_grid = {
    'n_estimators': [300, 500, 700, 1000],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10, 15]
}
# Random Forest regressor
rf_model_g = RandomForestRegressor(random_state=44)
# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=rf_model_g, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')
# Perform grid search
grid_search.fit(X_train, y_train)
# Get the best model and its parameters
best_rf_model = grid_search.best_estimator_
best_params = grid_search.best_params_
# Print best parameters
print("Best parameters:", best_params)
# Make predictions on test data using the best model
y_pred_best_rf = best_rf_model.predict(X_test)
# Calculate MSE for the best model
mse_best_rf = mean_squared_error(y_test, y_pred_best_rf)
print("Mean Squared Error (Best Random Forest):", mse_best_rf)
Best parameters: {'max_depth': 10, 'min_samples_split': 15, 'n_estimators': 1000}
Mean Squared Error (Best Random Forest): 0.01116517121922181
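The full cross-validation results are also available on the fitted grid-search object, which is useful for seeing how close the runner-up configurations came:
# Top configurations ranked by mean cross-validated score
cv_results = pd.DataFrame(grid_search.cv_results_)
print(cv_results.sort_values('rank_test_score')[['params', 'mean_test_score', 'std_test_score']].head())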
# Visualize the variables that explain more than 1.3%
# Get the feature names
feature_names = X_train.columns
# Extract feature importances from the best Random Forest model
best_rf_feature_importances = best_rf_model.feature_importances_
# Create a DataFrame to store the feature importances
rf_model_var_imp = pd.DataFrame({'features': feature_names, 'importance': best_rf_feature_importances})
# Include only variables with importance > 0.013
important_variables = rf_model_var_imp[rf_model_var_imp['importance'] > 0.013]
# Sort by importance values
important_variables = important_variables.sort_values('importance')
# Plot bar chart
plt.figure(figsize=(8, 5))
plt.barh(important_variables['features'], important_variables['importance'])
plt.xlabel('Share of Total Explained')
plt.ylabel('Variable Name')
plt.title('Variable Importance (Top Variables)')
plt.show()
Resource Allocation (hours worked per day): Excessive work hours increase the risk of burnout, leading to mental exhaustion and reduced productivity. This impacts not only the workplace but also employees' personal lives. To foster a healthier work environment, organizations should prioritize workload optimization and set realistic deadlines that promote a better work-life balance.
WFH Setup Available: This result reflects the shift in the dynamics of modern work; organizations need to adapt their policies to accommodate remote work. It also underscores the need for flexibility in the work environment: working from home gives employees better control over their schedules and reduces the time and effort associated with commuting.
When reviewing the findings, I was surprised that 'Designation' (rank within the company) did not have a strong influence on the burnout rate, since the exploratory analysis showed a clear trend between rank and average burnout. I believe the outcomes of this analysis are relevant for organizations and should be taken into account, since burnout affects not only an employee's productivity but also their overall quality of life.
While working on this project, I encountered unexpected challenges. When applying the Box-Cox transformation, I was not satisfied with the results; after comparing the skewness before and after the transformation, I ultimately decided not to transform my data. Another challenge came up during model selection. Initially, the best model based on MSE was linear regression; while that is possible, it seemed suspicious, so I went back to my data and discovered a highly correlated variable, which I decided to drop. Lastly, I ran into a challenge during model tuning: with n_estimators set to 1000, I expected the grid search to take 20 minutes at most, but it took around 40 minutes. It was worth it, since it produced the best model, but it made me realize that with a larger dataset or under a time constraint I should choose more realistic parameter grids.