In [1]:
# Question 1: Pro male gender bias

import pandas as pd
import numpy as np
import statsmodels.api as sm

# Seed the RNG with Vaarun Muthappan's N number
N_Number = 18851315
np.random.seed(N_Number)

# Column labels for the numerical capstone file
col_names = [
    'Average Rating',
    'Average Difficulty',
    'Number of ratings',
    'Received a "pepper"?',
    'Proportion of students that would take the class again',
    'Number of ratings from online classes',
    'Male gender',
    'Female',
]

# Read the CSV file
num = pd.read_csv("rmpCapstoneNum.csv", names=col_names, index_col=False)

# Preprocess: keep professors with more than 25 ratings, then drop any row
# with a missing value
num = num[num['Number of ratings'] > 25].dropna()

# Keep only rows where exactly one of the two gender flags is set
# (removes both-1 and both-0 rows in a single step)
num = num[num['Female'] != num['Male gender']]

# 'Female' is perfectly collinear with 'Male gender', so drop it
num = num.drop(columns='Female')
print(num['Male gender'].value_counts())

# Regress average rating on every numeric predictor plus the gender flag
predictors = [
    'Average Difficulty',
    'Number of ratings',
    'Received a "pepper"?',
    'Proportion of students that would take the class again',
    'Number of ratings from online classes',
    'Male gender',
]
y_train = num['Average Rating']
X_train = sm.add_constant(num[predictors])

model = sm.OLS(y_train, X_train).fit()

print(model.summary())
1    569
0    350
Name: Male gender, dtype: int64
                            OLS Regression Results                            
==============================================================================
Dep. Variable:         Average Rating   R-squared:                       0.853
Model:                            OLS   Adj. R-squared:                  0.852
Method:                 Least Squares   F-statistic:                     880.8
Date:                Thu, 02 Jan 2025   Prob (F-statistic):               0.00
Time:                        18:01:18   Log-Likelihood:                -267.67
No. Observations:                 919   AIC:                             549.3
Df Residuals:                     912   BIC:                             583.1
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
==========================================================================================================================
                                                             coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------------------------------------
const                                                      2.3765      0.092     25.862      0.000       2.196       2.557
Average Difficulty                                        -0.1827      0.019     -9.624      0.000      -0.220      -0.145
Number of ratings                                          0.0002      0.000      0.503      0.615      -0.001       0.001
Received a "pepper"?                                       0.2203      0.027      8.179      0.000       0.167       0.273
Proportion of students that would take the class again     0.0261      0.001     40.691      0.000       0.025       0.027
Number of ratings from online classes                     -0.0004      0.004     -0.107      0.914      -0.007       0.007
Male gender                                               -0.0057      0.022     -0.253      0.800      -0.050       0.038
==============================================================================
Omnibus:                      207.966   Durbin-Watson:                   1.869
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              771.921
Skew:                          -1.040   Prob(JB):                    2.40e-168
Kurtosis:                       6.979   Cond. No.                         826.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [26]:
# Question 2: Gender difference in spread of ratings?
import pandas as pd
import numpy as np
from scipy import stats

# Seed the RNG with Vaarun Muthappan's N number
N_Number = 18851315
np.random.seed(N_Number)

# Run OLS with qualitative factors, reasoning: these are confounders that are in dataset
# Column labels for the numerical capstone file
col_names = [
    'Average Rating',
    'Average Difficulty',
    'Number of ratings',
    'Received a "pepper"?',
    'Proportion of students that would take the class again',
    'Number of ratings from online classes',
    'Male gender',
    'Female',
]

# Read the CSV file
num = pd.read_csv("rmpCapstoneNum.csv", names=col_names, index_col=False)

# Preprocess: keep professors with more than 25 ratings; drop rows missing
# only the columns this question actually uses
num = num[num['Number of ratings'] > 25]
num = num.dropna(subset=['Female', 'Male gender', 'Average Rating'])

# Keep only rows where exactly one gender flag is set
# (removes both-1 and both-0 rows in a single step)
num = num[num['Female'] != num['Male gender']]

# Split average ratings by gender; these names are reused by later cells
male_ratings = num.loc[num['Male gender'] == 1, 'Average Rating']
female_ratings = num.loc[num['Male gender'] == 0, 'Average Rating']

print("male / female ratings", len(male_ratings), len(female_ratings))

# Sample variances (ddof=1); also reused by the effect-size cell below
variance_male = np.var(male_ratings, ddof=1)
variance_female = np.var(female_ratings, ddof=1)

# Levene's test for equality of variances (robust to non-normality)
stat_levene, p_value_levene = stats.levene(male_ratings, female_ratings)

print(f"Variance of male ratings: {variance_male:.4f}")
print(f"Variance of female ratings: {variance_female:.4f}")
print(f"Levene's test statistic: {stat_levene:.4f}")
print(f"p-value: {p_value_levene:.4f}")

# Pre-registered significance threshold
alpha = 0.005
verdict = (
    "There is a statistically significant difference in the spread of ratings between genders."
    if p_value_levene < alpha
    else "There is no statistically significant difference in the spread of ratings between genders."
)
print(verdict)
male / female ratings 600 368
Variance of male ratings: 0.7194
Variance of female ratings: 0.7869
Levene's test statistic: 1.8664
p-value: 0.1722
There is no statistically significant difference in the spread of ratings between genders.
In [27]:
# Question 3: Size of these effects

import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

# --- Effect size for the gender difference in MEAN rating ---
# Uses male_ratings / female_ratings defined in the Question 2 cell.
male_mean = np.mean(male_ratings)
female_mean = np.mean(female_ratings)

male_std = np.std(male_ratings, ddof=1)
female_std = np.std(female_ratings, ddof=1)

n1, n2 = len(male_ratings), len(female_ratings)

# Pooled standard deviation: sqrt of the df-weighted average of the variances
pooled_std = np.sqrt(((n1 - 1) * male_std**2 + (n2 - 1) * female_std**2) / (n1 + n2 - 2))

# Cohen's d for the difference in means
cohens_d = (male_mean - female_mean) / pooled_std
print(f"Cohen's d: {cohens_d}")

# Degrees of freedom for the t-based confidence interval
df = n1 + n2 - 2

# Approximate standard error of d (Hedges & Olkin large-sample formula)
se_d = np.sqrt((n1 + n2) / (n1 * n2) + cohens_d**2 / (2 * (n1 + n2)))

# t critical value for a two-tailed 95% CI
t_value = stats.t.ppf(0.975, df)

ci_lower = cohens_d - t_value * se_d
ci_upper = cohens_d + t_value * se_d

print(f"95% Confidence Interval for Cohen's d: [{ci_lower:.3f}, {ci_upper:.3f}]")


# --- Effect size for the gender difference in SPREAD of average rating ---
# BUG FIX: the original computed sqrt(weighted sum) / (n1 + n2 - 2); the
# division by the degrees of freedom belongs INSIDE the square root, so the
# previously reported d of -1.40 was badly inflated.
pooled_std_spread = np.sqrt(((n1 - 1) * variance_male + (n2 - 1) * variance_female) / (n1 + n2 - 2))
cohens_d_spread = (np.sqrt(variance_male) - np.sqrt(variance_female)) / pooled_std_spread

# Normal-approximation 95% confidence interval for the spread effect size
se = np.sqrt((n1 + n2) / (n1 * n2) + cohens_d_spread**2 / (2 * (n1 + n2)))
ci_lower_spread = cohens_d_spread - 1.96 * se
ci_upper_spread = cohens_d_spread + 1.96 * se

print(f"Cohen's d for gender bias in spread of average rating: {cohens_d_spread:.4f}")
print(f"95% CI: [{ci_lower_spread:.4f}, {ci_upper_spread:.4f}]")

# --- Violin plot of the two rating distributions ---
ratings = [male_ratings, female_ratings]
labels = ['Male Ratings', 'Female Ratings']

plt.figure(figsize=(8, 6))
sns.violinplot(data=ratings, palette='Set2')
plt.xticks(ticks=[0, 1], labels=labels)
plt.title('Violin Plot of Male and Female Ratings')
plt.ylabel('Ratings')
plt.xlabel('')
plt.show()
Cohen's d: 0.10658257253576335
95% Confidence Interval for Cohen's d: [-0.023, 0.237]
Cohen's d for gender bias in spread of average rating: -1.4017
95% CI: [-1.5458, -1.2577]
No description has been provided for this image
In [28]:
# Question 4: Gender dif. in each of 20 tags
import pandas as pd
import numpy as np
from scipy import stats

N_Number = 18851315

np.random.seed(N_Number)

# Column labels for the two capstone files
tag_names = [
    "Tough grader", 
    "Good feedback", 
    "Respected", 
    "Lots to read", 
    "Participation matters", 
    "Don't skip class or you will not pass", 
    "Lots of homework", 
    "Inspirational", 
    "Pop quizzes!", 
    "Accessible", 
    "So many papers", 
    "Clear grading", 
    "Hilarious", 
    "Test heavy", 
    "Graded by few things", 
    "Amazing lectures", 
    "Caring", 
    "Extra credit", 
    "Group projects", 
    "Lecture heavy",
]
num_names = [
    'Average Rating',
    'Average Difficulty',
    'Number of ratings',
    'Received a "pepper"?',
    'Proportion of students that would take the class again',
    'Number of ratings from online classes',
    'Male gender',
    'Female',
]

# Read the CSV files
tags = pd.read_csv("rmpCapstoneTags.csv", names=tag_names, index_col=False)
num = pd.read_csv("rmpCapstoneNum.csv", names=num_names, index_col=False)

# Join gender / rating-count metadata onto the tag counts (row-aligned files)
merged_df = pd.concat([num[['Male gender', 'Female', 'Number of ratings']], tags], axis=1)

# Preprocess: keep professors with more than 25 ratings; drop rows missing
# any tag or the metadata columns we rely on
merged_df = merged_df[merged_df['Number of ratings'] > 25]
merged_df = merged_df.dropna(subset=list(tags.columns) + ['Number of ratings', 'Male gender'])

# Drop rows with both 1 or both 0 in 'Female' and 'Male gender' columns
merged_df = merged_df[~((merged_df['Female'] == 1) & (merged_df['Male gender'] == 1))]
merged_df = merged_df[~((merged_df['Female'] == 0) & (merged_df['Male gender'] == 0))]

# NORMALISE: express each tag as a rate per rating, so prolific professors
# don't dominate the comparison
tag_columns = tags.columns
for col in tag_columns:
    merged_df[col] = merged_df[col] / merged_df['Number of ratings']

# Drop the 'Female' column as collinear; rating count no longer needed
merged_df = merged_df.drop(['Female', 'Number of ratings'], axis=1)
print(merged_df['Male gender'].value_counts())

# Mann-Whitney U test per tag, male vs. female rates
is_male = merged_df['Male gender'] == 1
is_female = merged_df['Male gender'] == 0
results = []
for tag in tag_columns:
    u_stat, p_val = stats.mannwhitneyu(
        merged_df.loc[is_male, tag],
        merged_df.loc[is_female, tag],
        alternative='two-sided',
    )
    results.append((tag, u_stat, p_val))

# Collect into a frame and rank tags by p-value
results_df = pd.DataFrame(results, columns=['Tag', 'stat', 'p-value'])
sorted_results = results_df.sort_values(by='p-value')

print("Most Gendered Tags (Lowest p-values):")
print(sorted_results.head(3))

print("\nLeast Gendered Tags (Highest p-values):")
print(sorted_results.tail(3))
1    600
0    368
Name: Male gender, dtype: int64
Most Gendered Tags (Lowest p-values):
             Tag      stat       p-value
12     Hilarious  140543.5  7.343214e-13
2      Respected  129805.5  4.271263e-06
17  Extra credit   93699.0  5.935096e-05

Least Gendered Tags (Highest p-values):
              Tag      stat   p-value
8    Pop quizzes!  108246.5  0.535528
19  Lecture heavy  112616.5  0.595073
0    Tough grader  110058.0  0.935189
In [29]:
# Question 5: Gender dif. in terms of average difficulty?

import pandas as pd
import numpy as np
from scipy import stats

N_Number = 18851315

np.random.seed(N_Number)

# Read the CSV files
num = pd.read_csv("rmpCapstoneNum.csv", names=[
    'Average Rating',
    'Average Difficulty',
    'Number of ratings',
    'Received a "pepper"?',
    'Proportion of students that would take the class again',
    'Number of ratings from online classes',
    'Male gender',
    'Female'], index_col=False)


# Preprocess: keep professors with more than 25 ratings; drop rows missing
# the columns this question uses
num = num[num['Number of ratings'] > 25]
num = num.dropna(subset=['Female', 'Male gender', 'Average Rating', 'Average Difficulty'])

# Drop rows with both 1 or both 0 in 'Female' and 'Male gender' columns
num = num[~((num['Female'] == 1) & (num['Male gender'] == 1))]
num = num[~((num['Female'] == 0) & (num['Male gender'] == 0))]

# Separate difficulty scores by gender (reused by the Question 6 cell)
male_difficulty = num[num['Male gender'] == 1]['Average Difficulty']
female_difficulty = num[num['Male gender'] == 0]['Average Difficulty']
print(num['Male gender'].value_counts())

# Perform the Mann-Whitney U test (two-sided by default).
# FIX: the original comment and variable said "KS test", but the statistic
# actually computed is Mann-Whitney U.
u_statistic, p_value = stats.mannwhitneyu(male_difficulty, female_difficulty)

# Print the results
print(f"Mann Whitney U test Statistic: {u_statistic}")
print(f"P-value: {p_value}")

# Pre-registered significance threshold of 0.005
if p_value < 0.005:
    print("There is a statistically significant difference in the distributions of average difficulty between male and female professors.")
else:
    print("There is no statistically significant difference in the distributions of average difficulty between male and female professors.")
1    600
0    368
Name: Male gender, dtype: int64
Mann Whitney U test Statistic: 112107.5
P-value: 0.6857788248287642
There is no statistically significant difference in the distributions of average difficulty between male and female professors.
In [30]:
# Question 6: Size of above effect at 95% confidence interval

from scipy import stats
import numpy as np

def cohens_d(group1, group2):
    """Cohen's d for two independent samples, using the pooled standard deviation."""
    n1, n2 = len(group1), len(group2)
    var1 = np.var(group1, ddof=1)
    var2 = np.var(group2, ddof=1)
    # df-weighted average of the two sample variances, then square root
    pooled_sd = np.sqrt(((n1 - 1) * var1 + (n2 - 1) * var2) / (n1 + n2 - 2))
    return (np.mean(group1) - np.mean(group2)) / pooled_sd

# Effect size for the male/female difference in average difficulty
# (male_difficulty / female_difficulty come from the Question 5 cell)
d = cohens_d(male_difficulty, female_difficulty)

n1, n2 = len(male_difficulty), len(female_difficulty)
print(n1, n2)

# Approximate standard error of d, then a normal-approximation 95% CI
se_d = np.sqrt((n1 + n2) / (n1 * n2) + d**2 / (2 * (n1 + n2)))
ci_lower = d - 1.96 * se_d
ci_upper = d + 1.96 * se_d

print(f"Cohen's d: {d:.4f}")
print(f"95% Confidence Interval: [{ci_lower:.4f}, {ci_upper:.4f}]")
600 368
Cohen's d: 0.0289
95% Confidence Interval: [-0.1009, 0.1587]
In [31]:
# Question 7: Predict average rating from the numerical predictors
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler

# seed for reproducibility
N_Number = 18851315
np.random.seed(N_Number)

# columns to use
numerical_cols = [
    'Average Rating',
    'Average Difficulty',
    'Number of ratings',
    'Received a "pepper"?',
    'Proportion of students that would take the class again',
    'Number of ratings from online classes',
    'Male gender',
    'Female'
]

# load and preprocess data
# BUG FIX: this cell previously relied on a leftover `num` from an earlier
# cell (hidden kernel state) and never used `numerical_cols`; read the CSV
# here so the cell survives Restart Kernel -> Run All.
num = pd.read_csv("rmpCapstoneNum.csv", names=numerical_cols, index_col=False)
num = num[num['Number of ratings'] > 25]  # filter rows with sufficient ratings
num = num.dropna()  # drop rows with missing values

# enforce mutual exclusivity in gender columns
num = num[~((num['Female'] == 1) & (num['Male gender'] == 1))]
num = num[~((num['Female'] == 0) & (num['Male gender'] == 0))]

# drop one of the gender columns (avoid redundancy)
num = num.drop('Female', axis=1)

X = num.drop('Average Rating', axis=1)
y = num['Average Rating']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=N_Number)

# standardize predictors (fit the scaler on the training split only)
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

def print_performance(model, X_test, y_test, model_name):
    """Print held-out R-squared and RMSE for a fitted regression model."""
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(f"{model_name} - test set performance:")
    print(f"r-squared: {r2:.4f}")
    print(f"rmse: {rmse:.4f}\n")

# ols regression (baseline, no regularization)
ols = LinearRegression()
ols.fit(X_train_scaled, y_train)

# print ols coefficients, largest magnitude first
ols_coefficients = pd.DataFrame({
    'Feature': X_train.columns,
    'Coefficient': ols.coef_
}).sort_values(by='Coefficient', key=abs, ascending=False)

print("\nols coefficients:")
print(ols_coefficients)

print_performance(ols, X_test_scaled, y_test, "ols")

# ridge regression, alpha chosen by 5-fold cross-validation
ridge_cv = GridSearchCV(Ridge(), {'alpha': [0.01, 0.1, 1.0, 10.0, 100.0, 200.0]}, cv=5)
ridge_cv.fit(X_train_scaled, y_train)

print(f"best ridge alpha: {ridge_cv.best_params_['alpha']:.4f}")
ridge_coefficients = pd.DataFrame({
    'Feature': X_train.columns,
    'Coefficient': ridge_cv.best_estimator_.coef_
}).sort_values(by='Coefficient', key=abs, ascending=False)

print("\nridge coefficients:")
print(ridge_coefficients)
print_performance(ridge_cv, X_test_scaled, y_test, "ridge")

# lasso regression, alpha chosen by 5-fold cross-validation
lasso_cv = GridSearchCV(Lasso(), {'alpha': [0.01, 0.1, 1.0, 10.0, 100.0, 200.0]}, cv=5)
lasso_cv.fit(X_train_scaled, y_train)

print(f"best lasso alpha: {lasso_cv.best_params_['alpha']:.4f}")
lasso_coefficients = pd.DataFrame({
    'Feature': X_train.columns,
    'Coefficient': lasso_cv.best_estimator_.coef_
})
# lasso zeroes out weak predictors; show only the survivors
lasso_nonzero = lasso_coefficients[lasso_coefficients['Coefficient'] != 0].sort_values(by='Coefficient', key=abs, ascending=False)

print("\nlasso non-zero coefficients:")
print(lasso_nonzero)
print_performance(lasso_cv, X_test_scaled, y_test, "lasso")
ols coefficients:
                                             Feature  Coefficient
3  Proportion of students that would take the cla...     0.610126
0                                 Average Difficulty    -0.126961
2                               Received a "pepper"?     0.121107
5                                        Male gender    -0.009086
1                                  Number of ratings     0.009050
4              Number of ratings from online classes    -0.003414
ols - test set performance:
r-squared: 0.8723
rmse: 0.3107

best ridge alpha: 10.0000

ridge coefficients:
                                             Feature  Coefficient
3  Proportion of students that would take the cla...     0.596541
0                                 Average Difficulty    -0.131580
2                               Received a "pepper"?     0.125409
1                                  Number of ratings     0.009700
5                                        Male gender    -0.007660
4              Number of ratings from online classes    -0.002964
ridge - test set performance:
r-squared: 0.8713
rmse: 0.3119

best lasso alpha: 0.0100

lasso non-zero coefficients:
                                             Feature  Coefficient
3  Proportion of students that would take the cla...     0.606264
0                                 Average Difficulty    -0.121337
2                               Received a "pepper"?     0.115826
lasso - test set performance:
r-squared: 0.8729
rmse: 0.3099

In [32]:
# Question 8: Rating vs tags
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
import random
from statsmodels.stats.outliers_influence import variance_inflation_factor


N_Number = 18851315

np.random.seed(N_Number)
random.seed(N_Number)

# Read the CSV files
tags = pd.read_csv("rmpCapstoneTags.csv", names=[
    "Tough grader", 
    "Good feedback", 
    "Respected", 
    "Lots to read", 
    "Participation matters", 
    "Don't skip class or you will not pass", 
    "Lots of homework", 
    "Inspirational", 
    "Pop quizzes!", 
    "Accessible", 
    "So many papers", 
    "Clear grading", 
    "Hilarious", 
    "Test heavy", 
    "Graded by few things", 
    "Amazing lectures", 
    "Caring", 
    "Extra credit", 
    "Group projects", 
    "Lecture heavy"], index_col=False)

num = pd.read_csv("rmpCapstoneNum.csv", names=[
    'Average Rating',
    'Average Difficulty',
    'Number of ratings',
    'Received a "pepper"?',
    'Proportion of students that would take the class again',
    'Number of ratings from online classes',
    'Male gender',
    'Female'], index_col=False)

# Compute VIF and correlation matrix for tag predictors to check collinearity
vif_data = pd.DataFrame()
vif_data["Feature"] = tags.columns  # Tag column names
vif_data["VIF"] = [variance_inflation_factor(tags.values, i) for i in range(tags.shape[1])]
print("Variance Inflation Factor (VIF):")
print(vif_data)

corr_matrix = tags.corr()
print(corr_matrix)


# Merge the dataframes to drop NAN before splitting into X,Y and train/test
merged_df = pd.concat([num, tags], axis=1)

# Only keep rows with >25 ratings
merged_df = merged_df[merged_df['Number of ratings'] > 25]

merged_df = merged_df[~((merged_df['Female'] == 1) & (merged_df['Male gender'] == 1))]
merged_df = merged_df[~((merged_df['Female'] == 0) & (merged_df['Male gender'] == 0))]

# Can drop "respect" and "caring" as they have the highest correlation (>0.7) with other columns
merged_df.drop(['Average Difficulty',
    'Received a "pepper"?',
    'Proportion of students that would take the class again',
    'Number of ratings from online classes',
    'Male gender',
    'Female'], axis=1, inplace=True)

tag_columns = tags.columns
for col in tag_columns:
    merged_df[col] = merged_df[col] / merged_df['Number of ratings']
    
merged_df.drop(['Caring','Respected','Number of ratings'], axis=1, inplace=True)
merged_df = merged_df.dropna()

# Split the data into train and test sets
X = merged_df.iloc[:, 2:]  # All columns from tags
y = merged_df['Average Rating']

print(merged_df.columns)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=N_Number)

# Function to print model summary and performance
def print_model_summary(model, X_test, y_test, model_name):
    """Print held-out R-squared, RMSE, and (when available) sorted coefficients.

    Parameters
    ----------
    model : fitted estimator exposing .predict() (and optionally .coef_)
    X_test : array-like or DataFrame of test predictors
    y_test : array-like of test targets
    model_name : str, label used in the printed header
    """
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    print(f"\n{model_name} Results:")
    print(f"R-squared: {r2:.4f}")
    print(f"RMSE: {rmse:.4f}")

    if hasattr(model, 'coef_'):
        # BUG FIX: previously this always read the *global* X for feature
        # names, which breaks if X is renamed or the function is reused.
        # Prefer X_test's own column labels; fall back to the global X only
        # for scaled numpy arrays, which carry no labels.
        feature_names = X_test.columns if hasattr(X_test, 'columns') else X.columns
        print("\nCoefficients:")
        # Sort by signed coefficient, most positive first
        sorted_features_coeffs = sorted(zip(feature_names, model.coef_), key=lambda x: (x[1]), reverse=True)
        for feature, coef in sorted_features_coeffs:
            print(f"{feature}: {coef:.4f}")

# OLS Regression
ols = LinearRegression()
# 6-fold CV on the training set; sklearn's scorer returns *negative* MSE
ols_scores = cross_val_score(ols, X_train, y_train, cv=6, scoring='neg_mean_squared_error')
# BUG FIX: the CV scores were computed but never reported before
print(f"OLS 6-fold CV RMSE: {np.sqrt(-ols_scores.mean()):.4f}")
ols.fit(X_train, y_train)
print_model_summary(ols, X_test, y_test, "OLS Regression")

# Scale the features (X) and target variable (y) so the Ridge/LASSO penalty
# treats all predictors comparably
scaler_X = StandardScaler()
scaler_y = StandardScaler()

# Fit the scaler on the training data only (no test-set leakage),
# then transform both train and test sets
X_train_scaled = scaler_X.fit_transform(X_train)
X_test_scaled = scaler_X.transform(X_test)

# Same train-only fit for the target
y_train_scaled = scaler_y.fit_transform(y_train.values.reshape(-1, 1)).flatten()
y_test_scaled = scaler_y.transform(y_test.values.reshape(-1, 1)).flatten()

# Ridge Regression — penalty strength chosen by 6-fold CV
ridge_cv = RidgeCV(alphas=[0.01, 0.1, 1.0, 10.0, 100.0, 200.0], cv=6)
ridge_cv.fit(X_train_scaled, y_train_scaled)
print(f"\nBest alpha for Ridge: {ridge_cv.alpha_:.4f}")
print_model_summary(ridge_cv, X_test_scaled, y_test_scaled, "Ridge Regression")

# LASSO Regression — penalty strength chosen by 6-fold CV, seeded
lasso_cv = LassoCV(alphas=[0.01, 0.1, 1.0, 10.0, 100.0, 200.0], cv=6, random_state=N_Number)
lasso_cv.fit(X_train_scaled, y_train_scaled)
print(f"\nBest alpha for LASSO: {lasso_cv.alpha_:.4f}")
print_model_summary(lasso_cv, X_test_scaled, y_test_scaled, "LASSO Regression")
Variance Inflation Factor (VIF):
                                  Feature       VIF
0                            Tough grader  2.528223
1                           Good feedback  3.407101
2                               Respected  4.038748
3                            Lots to read  1.797317
4                   Participation matters  2.199905
5   Don't skip class or you will not pass  2.544241
6                        Lots of homework  1.906057
7                           Inspirational  2.963519
8                            Pop quizzes!  1.207973
9                              Accessible  1.881011
10                         So many papers  1.258024
11                          Clear grading  2.358959
12                              Hilarious  2.069158
13                             Test heavy  1.738232
14                   Graded by few things  1.422271
15                       Amazing lectures  3.015469
16                                 Caring  3.653912
17                           Extra credit  1.573000
18                         Group projects  1.254131
19                          Lecture heavy  1.907449
                                       Tough grader  Good feedback  Respected  \
Tough grader                               1.000000       0.229954   0.171610   
Good feedback                              0.229954       1.000000   0.686683   
Respected                                  0.171610       0.686683   1.000000   
Lots to read                               0.499682       0.311697   0.243321   
Participation matters                      0.281985       0.576367   0.498679   
Don't skip class or you will not pass      0.570231       0.341158   0.373045   
Lots of homework                           0.527124       0.304581   0.260685   
Inspirational                              0.140145       0.590619   0.727672   
Pop quizzes!                               0.302569       0.118450   0.139579   
Accessible                                 0.255412       0.511265   0.499878   
So many papers                             0.282819       0.267818   0.130270   
Clear grading                              0.221003       0.586601   0.550932   
Hilarious                                  0.167575       0.448301   0.567985   
Test heavy                                 0.465068       0.113567   0.168253   
Graded by few things                       0.265465       0.151854   0.180604   
Amazing lectures                           0.189300       0.499613   0.702187   
Caring                                     0.208095       0.707304   0.740289   
Extra credit                               0.190986       0.371942   0.407863   
Group projects                             0.185720       0.261789   0.253329   
Lecture heavy                              0.493056       0.156719   0.201295   

                                       Lots to read  Participation matters  \
Tough grader                               0.499682               0.281985   
Good feedback                              0.311697               0.576367   
Respected                                  0.243321               0.498679   
Lots to read                               1.000000               0.390119   
Participation matters                      0.390119               1.000000   
Don't skip class or you will not pass      0.373778               0.424883   
Lots of homework                           0.330470               0.325916   
Inspirational                              0.240350               0.474341   
Pop quizzes!                               0.253375               0.140631   
Accessible                                 0.214175               0.339283   
So many papers                             0.284022               0.176382   
Clear grading                              0.278827               0.442919   
Hilarious                                  0.211889               0.422364   
Test heavy                                 0.282345               0.157590   
Graded by few things                       0.223241               0.191287   
Amazing lectures                           0.280375               0.433445   
Caring                                     0.250726               0.492437   
Extra credit                               0.229563               0.377400   
Group projects                             0.158490               0.355646   
Lecture heavy                              0.397294               0.234684   

                                       Don't skip class or you will not pass  \
Tough grader                                                        0.570231   
Good feedback                                                       0.341158   
Respected                                                           0.373045   
Lots to read                                                        0.373778   
Participation matters                                               0.424883   
Don't skip class or you will not pass                               1.000000   
Lots of homework                                                    0.502055   
Inspirational                                                       0.279040   
Pop quizzes!                                                        0.315635   
Accessible                                                          0.376565   
So many papers                                                      0.159098   
Clear grading                                                       0.429774   
Hilarious                                                           0.389384   
Test heavy                                                          0.493759   
Graded by few things                                                0.331314   
Amazing lectures                                                    0.401750   
Caring                                                              0.384829   
Extra credit                                                        0.360939   
Group projects                                                      0.214164   
Lecture heavy                                                       0.494568   

                                       Lots of homework  Inspirational  \
Tough grader                                   0.527124       0.140145   
Good feedback                                  0.304581       0.590619   
Respected                                      0.260685       0.727672   
Lots to read                                   0.330470       0.240350   
Participation matters                          0.325916       0.474341   
Don't skip class or you will not pass          0.502055       0.279040   
Lots of homework                               1.000000       0.168358   
Inspirational                                  0.168358       1.000000   
Pop quizzes!                                   0.274497       0.092585   
Accessible                                     0.345145       0.401629   
So many papers                                 0.196875       0.147548   
Clear grading                                  0.345238       0.392426   
Hilarious                                      0.183263       0.522413   
Test heavy                                     0.336636       0.099004   
Graded by few things                           0.141825       0.140856   
Amazing lectures                               0.193409       0.692467   
Caring                                         0.338463       0.631511   
Extra credit                                   0.330417       0.305918   
Group projects                                 0.164798       0.231867   
Lecture heavy                                  0.320914       0.128204   

                                       Pop quizzes!  Accessible  \
Tough grader                               0.302569    0.255412   
Good feedback                              0.118450    0.511265   
Respected                                  0.139579    0.499878   
Lots to read                               0.253375    0.214175   
Participation matters                      0.140631    0.339283   
Don't skip class or you will not pass      0.315635    0.376565   
Lots of homework                           0.274497    0.345145   
Inspirational                              0.092585    0.401629   
Pop quizzes!                               1.000000    0.147522   
Accessible                                 0.147522    1.000000   
So many papers                             0.058107    0.102073   
Clear grading                              0.146541    0.502592   
Hilarious                                  0.124886    0.373559   
Test heavy                                 0.183711    0.258560   
Graded by few things                       0.097084    0.220145   
Amazing lectures                           0.147970    0.429340   
Caring                                     0.127997    0.599456   
Extra credit                               0.137479    0.352118   
Group projects                             0.069418    0.175262   
Lecture heavy                              0.229478    0.263272   

                                       So many papers  Clear grading  \
Tough grader                                 0.282819       0.221003   
Good feedback                                0.267818       0.586601   
Respected                                    0.130270       0.550932   
Lots to read                                 0.284022       0.278827   
Participation matters                        0.176382       0.442919   
Don't skip class or you will not pass        0.159098       0.429774   
Lots of homework                             0.196875       0.345238   
Inspirational                                0.147548       0.392426   
Pop quizzes!                                 0.058107       0.146541   
Accessible                                   0.102073       0.502592   
So many papers                               1.000000       0.116832   
Clear grading                                0.116832       1.000000   
Hilarious                                    0.104625       0.417913   
Test heavy                                   0.036550       0.264296   
Graded by few things                         0.082795       0.284087   
Amazing lectures                             0.079873       0.469776   
Caring                                       0.139631       0.585987   
Extra credit                                 0.061238       0.465386   
Group projects                               0.106371       0.238718   
Lecture heavy                                0.121763       0.319024   

                                       Hilarious  Test heavy  \
Tough grader                            0.167575    0.465068   
Good feedback                           0.448301    0.113567   
Respected                               0.567985    0.168253   
Lots to read                            0.211889    0.282345   
Participation matters                   0.422364    0.157590   
Don't skip class or you will not pass   0.389384    0.493759   
Lots of homework                        0.183263    0.336636   
Inspirational                           0.522413    0.099004   
Pop quizzes!                            0.124886    0.183711   
Accessible                              0.373559    0.258560   
So many papers                          0.104625    0.036550   
Clear grading                           0.417913    0.264296   
Hilarious                               1.000000    0.241308   
Test heavy                              0.241308    1.000000   
Graded by few things                    0.257546    0.416451   
Amazing lectures                        0.634742    0.221910   
Caring                                  0.497715    0.183579   
Extra credit                            0.384361    0.231502   
Group projects                          0.288900    0.099936   
Lecture heavy                           0.202682    0.475742   

                                       Graded by few things  Amazing lectures  \
Tough grader                                       0.265465          0.189300   
Good feedback                                      0.151854          0.499613   
Respected                                          0.180604          0.702187   
Lots to read                                       0.223241          0.280375   
Participation matters                              0.191287          0.433445   
Don't skip class or you will not pass              0.331314          0.401750   
Lots of homework                                   0.141825          0.193409   
Inspirational                                      0.140856          0.692467   
Pop quizzes!                                       0.097084          0.147970   
Accessible                                         0.220145          0.429340   
So many papers                                     0.082795          0.079873   
Clear grading                                      0.284087          0.469776   
Hilarious                                          0.257546          0.634742   
Test heavy                                         0.416451          0.221910   
Graded by few things                               1.000000          0.222411   
Amazing lectures                                   0.222411          1.000000   
Caring                                             0.188489          0.571739   
Extra credit                                       0.182106          0.367060   
Group projects                                     0.134005          0.238078   
Lecture heavy                                      0.386136          0.238837   

                                         Caring  Extra credit  Group projects  \
Tough grader                           0.208095      0.190986        0.185720   
Good feedback                          0.707304      0.371942        0.261789   
Respected                              0.740289      0.407863        0.253329   
Lots to read                           0.250726      0.229563        0.158490   
Participation matters                  0.492437      0.377400        0.355646   
Don't skip class or you will not pass  0.384829      0.360939        0.214164   
Lots of homework                       0.338463      0.330417        0.164798   
Inspirational                          0.631511      0.305918        0.231867   
Pop quizzes!                           0.127997      0.137479        0.069418   
Accessible                             0.599456      0.352118        0.175262   
So many papers                         0.139631      0.061238        0.106371   
Clear grading                          0.585987      0.465386        0.238718   
Hilarious                              0.497715      0.384361        0.288900   
Test heavy                             0.183579      0.231502        0.099936   
Graded by few things                   0.188489      0.182106        0.134005   
Amazing lectures                       0.571739      0.367060        0.238078   
Caring                                 1.000000      0.438533        0.231486   
Extra credit                           0.438533      1.000000        0.200623   
Group projects                         0.231486      0.200623        1.000000   
Lecture heavy                          0.224560      0.289317        0.183903   

                                       Lecture heavy  
Tough grader                                0.493056  
Good feedback                               0.156719  
Respected                                   0.201295  
Lots to read                                0.397294  
Participation matters                       0.234684  
Don't skip class or you will not pass       0.494568  
Lots of homework                            0.320914  
Inspirational                               0.128204  
Pop quizzes!                                0.229478  
Accessible                                  0.263272  
So many papers                              0.121763  
Clear grading                               0.319024  
Hilarious                                   0.202682  
Test heavy                                  0.475742  
Graded by few things                        0.386136  
Amazing lectures                            0.238837  
Caring                                      0.224560  
Extra credit                                0.289317  
Group projects                              0.183903  
Lecture heavy                               1.000000  
Index(['Average Rating', 'Tough grader', 'Good feedback', 'Lots to read',
       'Participation matters', 'Don't skip class or you will not pass',
       'Lots of homework', 'Inspirational', 'Pop quizzes!', 'Accessible',
       'So many papers', 'Clear grading', 'Hilarious', 'Test heavy',
       'Graded by few things', 'Amazing lectures', 'Extra credit',
       'Group projects', 'Lecture heavy'],
      dtype='object')

OLS Regression Results:
R-squared: 0.7196
RMSE: 0.4460

Coefficients:
Amazing lectures: 1.5931
Clear grading: 1.2293
Good feedback: 1.1381
Inspirational: 1.0018
Accessible: 0.7893
Hilarious: 0.7184
Extra credit: 0.6528
Participation matters: 0.3864
Pop quizzes!: 0.0779
Don't skip class or you will not pass: -0.1214
Lots to read: -0.6500
Lots of homework: -0.6567
Group projects: -0.7286
So many papers: -0.8571
Graded by few things: -0.9535
Lecture heavy: -1.0738
Test heavy: -1.1652

Best alpha for Ridge: 100.0000

Ridge Regression Results:
R-squared: 0.7230
RMSE: 0.5111

Coefficients:
Amazing lectures: 0.2430
Good feedback: 0.2185
Clear grading: 0.1562
Inspirational: 0.1444
Hilarious: 0.1392
Extra credit: 0.0973
Accessible: 0.0823
Participation matters: 0.0601
Pop quizzes!: 0.0001
Don't skip class or you will not pass: -0.0316
So many papers: -0.0458
Graded by few things: -0.0505
Group projects: -0.0643
Test heavy: -0.1018
Lots to read: -0.1137
Lots of homework: -0.1257
Lecture heavy: -0.1445

Best alpha for LASSO: 0.0100

LASSO Regression Results:
R-squared: 0.7234
RMSE: 0.5107

Coefficients:
Amazing lectures: 0.2709
Good feedback: 0.2474
Clear grading: 0.1632
Inspirational: 0.1427
Hilarious: 0.1414
Extra credit: 0.1018
Accessible: 0.0788
Participation matters: 0.0502
Pop quizzes!: 0.0000
Don't skip class or you will not pass: -0.0140
So many papers: -0.0398
Graded by few things: -0.0422
Group projects: -0.0579
Test heavy: -0.0968
Lots to read: -0.1124
Lots of homework: -0.1216
Lecture heavy: -0.1485
In [33]:
# Question 9
import numpy as np
import pandas as pd 
from scipy.stats import f
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor

# BUG FIX: these CSVs have no header row. Reading them without header=None
# consumed the first professor's record as the header (visible downstream:
# the qualitative columns were previously named after a data row, e.g.
# "Criminal Justice / George Mason University / VA").
num_data = pd.read_csv('rmpCapstoneNum.csv', header=None)
qual_data = pd.read_csv('rmpCapstoneQual.csv', header=None)
tag_data = pd.read_csv('rmpCapstoneTags.csv', header=None)

## set alpha level (significance threshold used throughout the capstone)
alpha = 0.005

N_Number = 18851315

## same N_number, keep >25 samples, drop rows that are both 0 or both 1, drop female column
np.random.seed(N_Number)

num_data.columns = [
    'Average Rating', 
    'Average Difficulty', 
    'Number of Ratings', 
    'Received a Pepper', 
    'Proportion Take Again', 
    'Number of Online Ratings', 
    'Male', 
    'Female'
]

# Rename columns in tag_data
tag_data.columns = [
    "Tough grader",
    "Good feedback",
    "Respected",
    "Lots to read",
    "Participation matters",
    "Don’t skip class or you will not pass",
    "Lots of homework",
    "Inspirational",
    "Pop quizzes!",
    "Accessible",
    "So many papers",
    "Clear grading",
    "Hilarious",
    "Test heavy",
    "Graded by few things",
    "Amazing lectures",
    "Caring",
    "Extra credit",
    "Group projects",
    "Lecture heavy"
]

# Rows of the three files describe the same professors, so align by position
num_data = pd.concat([num_data, tag_data, qual_data], axis=1)
# Keep rows where exactly one gender flag is set (drops both-0 and both-1 rows)
num_data = num_data[num_data['Male'] != num_data['Female']]
# Averages are only trustworthy with a reasonable number of ratings
num_data = num_data[num_data['Number of Ratings'] > 25]
# 'Female' is redundant given 'Male' after the filter above
num_data = num_data.drop(columns=['Female'])

# Display the first few rows of the cleaned dataset
print(num_data.head())

# Print the shape of the cleaned dataset
print(f"dataset shape: {num_data.shape}")

# NOTE(review): an earlier draft computed num_data.dropna() here and then
# immediately discarded the result (dead statements, now removed). The
# analysis below deliberately proceeds on the un-dropped frame, matching
# the original final assignment.
filtered_num_data = num_data

tag_columns = [
    'Tough grader', 'Good feedback', 'Respected', 'Lots to read',
    'Participation matters', 'Don’t skip class or you will not pass',
    'Lots of homework', 'Inspirational', 'Pop quizzes!', 'Accessible',
    'So many papers', 'Clear grading', 'Hilarious', 'Test heavy',
    'Graded by few things', 'Amazing lectures', 'Caring', 'Extra credit',
    'Group projects', 'Lecture heavy'
]

# BUG FIX: filtered_num_data is derived from boolean-mask slices of
# num_data, so writing into it directly triggers pandas' chained-assignment
# SettingWithCopyWarning and may silently fail to write. Take an explicit
# copy before mutating.
filtered_num_data = filtered_num_data.copy()

# Normalize tags by dividing each tag column by 'Number of Ratings'
filtered_num_data[tag_columns] = filtered_num_data[tag_columns].div(filtered_num_data['Number of Ratings'], axis=0)

# Define predictors (normalized tags) and target variable (difficulty)
X = filtered_num_data[tag_columns]
y = filtered_num_data['Average Difficulty']

# Run VIF on the tag columns to quantify collinearity
vif_data = pd.DataFrame()
vif_data['Feature'] = X.columns
vif_data['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

print("Variance Inflation Factor (VIF):")
print(vif_data)

# Drop 'Caring' and 'Respected' due to high VIF
X = X.drop(columns=['Caring', 'Respected'])

# Train-test split (seeded for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=N_Number)

# Standardize the predictors; the scaler is fit on the training split only
# so no information from the test set leaks into the transform.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- OLS baseline ---
ols_model = LinearRegression()
ols_model.fit(X_train_scaled, y_train)
ols_y_pred = ols_model.predict(X_test_scaled)

# Held-out performance
ols_r2 = r2_score(y_test, ols_y_pred)
ols_rmse = mean_squared_error(y_test, ols_y_pred, squared=False)

print(f"OLS Regression - R²: {ols_r2:.4f}, RMSE: {ols_rmse:.4f}")

# Rank tags by the magnitude of their standardized coefficients
ols_coefficients = (
    pd.DataFrame({'Feature': X.columns, 'Coefficient': ols_model.coef_})
    .sort_values(by='Coefficient', key=abs, ascending=False)
)

print("OLS Regression Coefficients:")
print(ols_coefficients)

# --- Ridge regression, penalty strength chosen by 5-fold cross-validation ---
ridge_model = RidgeCV(alphas=[0.1, 1, 10, 100, 200], cv=5)
ridge_model.fit(X_train_scaled, y_train)
ridge_y_pred = ridge_model.predict(X_test_scaled)

# Held-out performance
ridge_r2 = r2_score(y_test, ridge_y_pred)
ridge_rmse = mean_squared_error(y_test, ridge_y_pred, squared=False)

print(f"Ridge Regression - Best Alpha: {ridge_model.alpha_}")
print(f"Ridge Regression - R²: {ridge_r2:.4f}, RMSE: {ridge_rmse:.4f}")

# Rank tags by the magnitude of their standardized coefficients
ridge_coefficients = (
    pd.DataFrame({'Feature': X.columns, 'Coefficient': ridge_model.coef_})
    .sort_values(by='Coefficient', key=abs, ascending=False)
)

print("Ridge Regression Coefficients:")
print(ridge_coefficients)

# --- Lasso regression, penalty strength chosen by 5-fold cross-validation ---
# NOTE(review): the selected alpha (0.1) sits at the lower edge of this grid;
# consider adding smaller candidates (e.g. 0.001, 0.01) — TODO confirm.
lasso_model = LassoCV(alphas=[0.1, 1, 10, 100, 200], cv=5, random_state=N_Number)
lasso_model.fit(X_train_scaled, y_train)
lasso_y_pred = lasso_model.predict(X_test_scaled)

# Held-out performance
lasso_r2 = r2_score(y_test, lasso_y_pred)
lasso_rmse = mean_squared_error(y_test, lasso_y_pred, squared=False)

print(f"Lasso Regression - Best Alpha: {lasso_model.alpha_}")
print(f"Lasso Regression - R²: {lasso_r2:.4f}, RMSE: {lasso_rmse:.4f}")

# Rank tags by |coefficient|; the L1 penalty zeroes out weak predictors
lasso_coefficients = (
    pd.DataFrame({'Feature': X.columns, 'Coefficient': lasso_model.coef_})
    .sort_values(by='Coefficient', key=abs, ascending=False)
)

print("Lasso Regression Coefficients:")
print(lasso_coefficients)

# Side-by-side comparison of the three models on the held-out test set
print("\nModel Comparison:")
print(f"OLS: R² = {ols_r2:.4f}, RMSE = {ols_rmse:.4f}")
print(f"Ridge: R² = {ridge_r2:.4f}, RMSE = {ridge_rmse:.4f}")
print(f"Lasso: R² = {lasso_r2:.4f}, RMSE = {lasso_rmse:.4f}")
     Average Rating  Average Difficulty  Number of Ratings  Received a Pepper  \
45              4.2                 1.8               26.0                1.0   
118             3.2                 3.9               29.0                0.0   
123             5.0                 1.9               60.0                1.0   
169             4.8                 2.6               26.0                0.0   
198             3.0                 3.8               26.0                0.0   

     Proportion Take Again  Number of Online Ratings  Male  Tough grader  \
45                    57.0                       8.0     1             0   
118                   50.0                       0.0     1            14   
123                  100.0                       4.0     0             2   
169                  100.0                       0.0     1             0   
198                   70.0                       0.0     0            12   

     Good feedback  Respected  ...  Test heavy  Graded by few things  \
45               1          0  ...           0                     2   
118              3          2  ...           0                     0   
123             12         22  ...           1                     1   
169              3         13  ...           1                     0   
198              1          1  ...           5                     4   

     Amazing lectures  Caring  Extra credit  Group projects  Lecture heavy  \
45                 11       1             2               0              2   
118                 2       5             9               0              2   
123                 8      36             4               0              1   
169                12       9             0               0              0   
198                 4       0             0               2              8   

           Criminal Justice                  George Mason University  VA  
45                  History                    Pasadena City College  CA  
118               Chemistry  University of Colorado Colorado Springs  CO  
123  MathDevelopmental Math                  Lone Star College (all)  TX  
169             Mathematics      University of Wisconsin - Milwaukee  WI  
198                 Biology                 University of Louisville  KY  

[5 rows x 30 columns]
dataset shape: (968, 30)
Variance Inflation Factor (VIF):
                                  Feature       VIF
0                            Tough grader  3.281639
1                           Good feedback  3.987277
2                               Respected  4.945778
3                            Lots to read  2.079865
4                   Participation matters  2.454495
5   Don’t skip class or you will not pass  3.163318
6                        Lots of homework  1.878281
7                           Inspirational  3.564262
8                            Pop quizzes!  1.245734
9                              Accessible  2.283465
10                         So many papers  1.375255
11                          Clear grading  2.856751
12                              Hilarious  2.121150
13                             Test heavy  2.140123
14                   Graded by few things  1.828563
15                       Amazing lectures  3.649584
16                                 Caring  5.061086
17                           Extra credit  1.694762
18                         Group projects  1.321729
19                          Lecture heavy  2.423144
OLS Regression - R²: 0.6543, RMSE: 0.4275
OLS Regression Coefficients:
                                  Feature  Coefficient
0                            Tough grader     0.310902
8                              Accessible     0.137767
12                             Test heavy     0.119827
10                          Clear grading    -0.106689
4   Don’t skip class or you will not pass     0.093906
2                            Lots to read     0.090998
11                              Hilarious    -0.090754
15                           Extra credit    -0.073131
5                        Lots of homework     0.064294
13                   Graded by few things    -0.039153
6                           Inspirational    -0.031628
1                           Good feedback    -0.020221
14                       Amazing lectures     0.019113
17                          Lecture heavy     0.014774
9                          So many papers     0.010938
3                   Participation matters    -0.006292
7                            Pop quizzes!     0.003608
16                         Group projects    -0.001595
Ridge Regression - Best Alpha: 10.0
Ridge Regression - R²: 0.6541, RMSE: 0.4276
Ridge Regression Coefficients:
                                  Feature  Coefficient
0                            Tough grader     0.304869
8                              Accessible     0.135161
12                             Test heavy     0.118969
10                          Clear grading    -0.106407
4   Don’t skip class or you will not pass     0.094248
11                              Hilarious    -0.090628
2                            Lots to read     0.090616
15                           Extra credit    -0.073471
5                        Lots of homework     0.064203
13                   Graded by few things    -0.038519
6                           Inspirational    -0.031820
1                           Good feedback    -0.021367
14                       Amazing lectures     0.016883
17                          Lecture heavy     0.015742
9                          So many papers     0.011777
3                   Participation matters    -0.007006
7                            Pop quizzes!     0.003653
16                         Group projects    -0.001640
Lasso Regression - Best Alpha: 0.1
Lasso Regression - R²: 0.5415, RMSE: 0.4924
Lasso Regression Coefficients:
                                  Feature  Coefficient
0                            Tough grader     0.358396
12                             Test heavy     0.057843
4   Don’t skip class or you will not pass     0.044740
11                              Hilarious    -0.034100
10                          Clear grading    -0.020305
8                              Accessible     0.010367
5                        Lots of homework     0.006425
2                            Lots to read     0.005779
16                         Group projects    -0.000000
15                           Extra credit    -0.000000
14                       Amazing lectures    -0.000000
13                   Graded by few things     0.000000
9                          So many papers     0.000000
1                           Good feedback    -0.000000
7                            Pop quizzes!     0.000000
6                           Inspirational    -0.000000
3                   Participation matters    -0.000000
17                          Lecture heavy     0.000000

Model Comparison:
OLS: R² = 0.6543, RMSE = 0.4275
Ridge: R² = 0.6541, RMSE = 0.4276
Lasso: R² = 0.5415, RMSE = 0.4924
In [34]:
# Question 10: pepper from tags + numerical
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve
import random

N_Number = 18851315 

# Seed both RNGs so the split (and any sampling) is reproducible
random.seed(N_Number)
np.random.seed(N_Number)

num = pd.read_csv("rmpCapstoneNum.csv", names=[
    'Average Rating',
    'Average Difficulty',
    'Number of ratings',
    'Received a "pepper"?',
    'Proportion of students that would take the class again',
    'Number of ratings from online classes',
    'Male gender',
    'Female'], index_col=False)

tags = pd.read_csv("rmpCapstoneTags.csv", names=[
    "Tough grader", 
    "Good feedback", 
    "Respected", 
    "Lots to read", 
    "Participation matters", 
    "Don't skip class or you will not pass", 
    "Lots of homework", 
    "Inspirational", 
    "Pop quizzes!", 
    "Accessible", 
    "So many papers", 
    "Clear grading", 
    "Hilarious", 
    "Test heavy", 
    "Graded by few things", 
    "Amazing lectures", 
    "Caring", 
    "Extra credit", 
    "Group projects", 
    "Lecture heavy"], index_col=False)

# Rows of the two files describe the same professors, so align by position
df = pd.concat([num, tags], axis=1)

# Restrict to professors with more than 25 ratings; averages are unreliable below that
df = df[df['Number of ratings'] > 25]

# Remove rows whose gender flags are contradictory (both set, or neither set)
both_set = (df['Female'] == 1) & (df['Male gender'] == 1)
neither_set = (df['Female'] == 0) & (df['Male gender'] == 0)
df = df[~both_set & ~neither_set]

# 'Female' duplicates 'Male gender'; 'Caring'/'Respected' are highly
# correlated with the other tags — drop all three, then drop NaN rows
df = df.drop(['Female', 'Caring', 'Respected'], axis=1)
df = df.dropna()

# Convert the remaining raw tag counts into per-rating proportions
tag_columns = tags.columns
for col in tag_columns:
    if col not in ('Caring', 'Respected'):
        df[col] = df[col] / df['Number of ratings']

# Features: everything except the target; target: the pepper flag
X = df.drop('Received a "pepper"?', axis=1)
y = df['Received a "pepper"?']

print(X.columns)

# Display the number of values in each class of 'Received a "pepper"?'
print("Class distribution:")
print(y.value_counts())


# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=N_Number, stratify=y)

num_columns = ['Average Rating',
    'Average Difficulty',
    'Number of ratings',
   # 'Received a "pepper"?',
    'Proportion of students that would take the class again',
    'Number of ratings from online classes',
    'Male gender']
scaler = StandardScaler()
X_train[num_columns] = scaler.fit_transform(X_train[num_columns])
X_test[num_columns] = scaler.transform(X_test[num_columns])

# Run logistic regression
model = LogisticRegression(random_state=N_Number, class_weight = 'balanced')
model.fit(X_train, y_train)

# Predict probabilities and calculate AUROC
y_pred_proba = model.predict_proba(X_test)[:, 1]
auroc = roc_auc_score(y_test, y_pred_proba)

print(f"\nAUROC: {auroc:.4f}")
print("Coefficients:")
for feature, coef in sorted(zip(X_train.columns, model.coef_[0]), key=lambda x: (x[1]), reverse=True):
    print(f"{feature}: {coef:.4f}")

# Calculate ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)

# Plot ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='blue', label='ROC Curve (area = {:.2f})'.format(auroc))
plt.plot([0, 1], [0, 1], color='red', linestyle='--')  # Diagonal line for random guessing
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.grid()
plt.show()
Index(['Average Rating', 'Average Difficulty', 'Number of ratings',
       'Proportion of students that would take the class again',
       'Number of ratings from online classes', 'Male gender', 'Tough grader',
       'Good feedback', 'Lots to read', 'Participation matters',
       'Don't skip class or you will not pass', 'Lots of homework',
       'Inspirational', 'Pop quizzes!', 'Accessible', 'So many papers',
       'Clear grading', 'Hilarious', 'Test heavy', 'Graded by few things',
       'Amazing lectures', 'Extra credit', 'Group projects', 'Lecture heavy'],
      dtype='object')
Class distribution:
1.0    551
0.0    368
Name: Received a "pepper"?, dtype: int64

AUROC: 0.9085
Coefficients:
Inspirational: 1.9448
Amazing lectures: 1.5378
Average Rating: 1.4424
Hilarious: 0.6536
Proportion of students that would take the class again: 0.1912
Average Difficulty: 0.1681
Tough grader: 0.1155
Number of ratings from online classes: 0.0917
Number of ratings: -0.0257
Extra credit: -0.0497
Group projects: -0.0687
Graded by few things: -0.0710
So many papers: -0.0802
Good feedback: -0.0920
Lots to read: -0.1020
Don't skip class or you will not pass: -0.1413
Male gender: -0.2135
Participation matters: -0.2489
Test heavy: -0.2537
Clear grading: -0.2844
Pop quizzes!: -0.2846
Accessible: -0.4476
Lots of homework: -0.5069
Lecture heavy: -0.8455
No description has been provided for this image
In [35]:
## Extra Credit
# Compare 'Average Difficulty' between Florida and California professors,
# using both the Kolmogorov-Smirnov and Mann-Whitney U tests.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import ks_2samp, mannwhitneyu

# Significance threshold used for all tests in this notebook
alpha = 0.005

# Seed the RNG with the author's N number for reproducibility
N_Number = 18851315
np.random.seed(N_Number)

# Load the dataset.
# FIX: the CSV files have no header row (every other cell in this notebook
# reads them with explicit names). Without header=None, pandas consumed the
# first professor record as a header row, silently dropping that observation.
num_data = pd.read_csv('rmpCapstoneNum.csv', header=None)
qual_data = pd.read_csv('rmpCapstoneQual.csv', header=None)
tag_data = pd.read_csv('rmpCapstoneTags.csv', header=None)

# Assign meaningful column names to num_data
num_data.columns = [
    'Average Rating', 
    'Average Difficulty', 
    'Number of Ratings', 
    'Received a Pepper', 
    'Proportion Take Again', 
    'Number of Online Ratings', 
    'Male', 
    'Female'
]

qual_data.columns = [
    "Major", "University", "State"
    ]

tag_data.columns = [
    "Tough grader",
    "Good feedback",
    "Respected",
    "Lots to read",
    "Participation matters",
    "Don’t skip class or you will not pass",
    "Lots of homework",
    "Inspirational",
    "Pop quizzes!",
    "Accessible",
    "So many papers",
    "Clear grading",
    "Hilarious",
    "Test heavy",
    "Graded by few things",
    "Amazing lectures",
    "Caring",
    "Extra credit",
    "Group projects",
    "Lecture heavy"
]

# Combine numerical, tag, and qualitative data (rows align by position)
num_data = pd.concat([num_data, tag_data, qual_data], axis=1)

# Same preprocessing as earlier cells:
# Drop rows where both Male and Female are 0 or both are 1 (ambiguous gender)
num_data = num_data[num_data['Male'] != num_data['Female']]
# Filter out professors with 25 or fewer ratings
num_data = num_data[num_data['Number of Ratings'] > 25]
# Drop the 'Female' column (complement of 'Male', hence collinear)
num_data = num_data.drop(columns=['Female'])

# Display the first few rows of the cleaned dataset
print(num_data.head())

# Print the shape of the cleaned dataset
print(f"dataset shape: {num_data.shape}")

num_data['Male'].value_counts()

filtered_num_data = num_data

filtered_num_data.shape


## Check sample sizes by state before picking which states to compare

# Count the number of entries by state
state_distribution = filtered_num_data['State'].value_counts()

# Print the state distribution
print("Distribution of Entries by State:")
print(state_distribution)

# Plot the distribution
plt.figure(figsize=(12, 8))
sns.barplot(x=state_distribution.index, y=state_distribution.values, palette="viridis")
plt.title("Distribution of Entries by State", fontsize=16)
plt.xlabel("State", fontsize=14)
plt.ylabel("Number of Entries", fontsize=14)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


## Run both the KS and Mann-Whitney U tests (non-parametric: ratings data
## is ordinal and not normally distributed)

# Filter data for Florida and California
fl_data = filtered_num_data[filtered_num_data['State'] == 'FL']
ca_data = filtered_num_data[filtered_num_data['State'] == 'CA']

# Extract the 'Average Difficulty' column for both states
fl_difficulty = fl_data['Average Difficulty'].dropna()
ca_difficulty = ca_data['Average Difficulty'].dropna()

# Ensure both datasets are not empty
if fl_difficulty.empty or ca_difficulty.empty:
    print("One or both datasets are empty. Ensure the 'State' column has correct values.")
else:
    # Perform KS test (sensitive to any difference in distribution shape)
    ks_stat, ks_p_value = ks_2samp(fl_difficulty, ca_difficulty)

    # Perform Mann-Whitney U test (sensitive to a shift in central tendency)
    mw_stat, mw_p_value = mannwhitneyu(fl_difficulty, ca_difficulty, alternative='two-sided')

    # Print KS test results
    print("Kolmogorov-Smirnov Test Results:")
    print(f"KS Statistic: {ks_stat:.4f}")
    print(f"P-value: {ks_p_value:.4f}")

    # Print Mann-Whitney U test results
    print("\nMann-Whitney U Test Results:")
    print(f"U Statistic: {mw_stat:.4f}")
    print(f"P-value: {mw_p_value:.4f}")

    # Interpret the results at the notebook-wide alpha of 0.005
    print("\nInterpretation:")
    if ks_p_value < alpha:
        print("KS Test: Statistically significant difference in average difficulty between FL and CA.")
    else:
        print("KS Test: No statistically significant difference in average difficulty between FL and CA.")

    if mw_p_value < alpha:
        print("Mann-Whitney U Test: Statistically significant difference in average difficulty between FL and CA.")
    else:
        print("Mann-Whitney U Test: No statistically significant difference in average difficulty between FL and CA.")

    # Visualize the distributions
    # FIX: shade= was deprecated in seaborn 0.11 and removed in 0.14; fill= is
    # the supported replacement with identical behavior.
    plt.figure(figsize=(10, 6))
    sns.kdeplot(fl_difficulty, label='Florida (FL)', fill=True, color='blue')
    sns.kdeplot(ca_difficulty, label='California (CA)', fill=True, color='red')
    plt.title('Average Difficulty Distribution: FL vs. CA', fontsize=16)
    plt.xlabel('Average Difficulty', fontsize=14)
    plt.ylabel('Density', fontsize=14)
    plt.legend(title='State', fontsize=12)
    plt.grid(alpha=0.3)
    plt.tight_layout()
    plt.show()
     Average Rating  Average Difficulty  Number of Ratings  Received a Pepper  \
45              4.2                 1.8               26.0                1.0   
118             3.2                 3.9               29.0                0.0   
123             5.0                 1.9               60.0                1.0   
169             4.8                 2.6               26.0                0.0   
198             3.0                 3.8               26.0                0.0   

     Proportion Take Again  Number of Online Ratings  Male  Tough grader  \
45                    57.0                       8.0     1             0   
118                   50.0                       0.0     1            14   
123                  100.0                       4.0     0             2   
169                  100.0                       0.0     1             0   
198                   70.0                       0.0     0            12   

     Good feedback  Respected  ...  Test heavy  Graded by few things  \
45               1          0  ...           0                     2   
118              3          2  ...           0                     0   
123             12         22  ...           1                     1   
169              3         13  ...           1                     0   
198              1          1  ...           5                     4   

     Amazing lectures  Caring  Extra credit  Group projects  Lecture heavy  \
45                 11       1             2               0              2   
118                 2       5             9               0              2   
123                 8      36             4               0              1   
169                12       9             0               0              0   
198                 4       0             0               2              8   

                      Major                               University  State  
45                  History                    Pasadena City College     CA  
118               Chemistry  University of Colorado Colorado Springs     CO  
123  MathDevelopmental Math                  Lone Star College (all)     TX  
169             Mathematics      University of Wisconsin - Milwaukee     WI  
198                 Biology                 University of Louisville     KY  

[5 rows x 30 columns]
dataset shape: (968, 30)
Distribution of Entries by State:
CA    194
TX    117
FL    113
NY     46
AZ     44
GA     35
VA     28
MI     25
ON     25
UT     23
OH     21
WA     19
IL     18
NJ     18
BC     17
AL     16
NV     16
NC     14
PA     13
MN     12
LA     12
AB     12
SC     11
WI     11
ID     10
TN     10
MD     10
MO      9
MA      9
IN      8
QC      6
OK      6
CT      5
AR      4
HI      4
SK      3
NE      3
CO      2
RI      2
NM      2
KS      2
KY      2
NS      2
DC      1
MS      1
WV      1
MB      1
DE      1
AK      1
VT      1
IA      1
OR      1
Name: State, dtype: int64
No description has been provided for this image
Kolmogorov-Smirnov Test Results:
KS Statistic: 0.1744
P-value: 0.0222

Mann-Whitney U Test Results:
U Statistic: 8885.5000
P-value: 0.0056

Interpretation:
KS Test: No statistically significant difference in average difficulty between FL and CA.
Mann-Whitney U Test: No statistically significant difference in average difficulty between FL and CA.
No description has been provided for this image
In [ ]:
 
In [ ]: