In [1]:
# Question 1: Pro male gender bias
import pandas as pd
import numpy as np
import statsmodels.api as sm
# Seed the RNG with Vaarun Muthappan's N number
N_Number = 18851315
np.random.seed(N_Number)
# Read the CSV file
num = pd.read_csv("rmpCapstoneNum.csv", names=[
'Average Rating',
'Average Difficulty',
'Number of ratings',
'Received a "pepper"?',
'Proportion of students that would take the class again',
'Number of ratings from online classes',
'Male gender',
'Female'], index_col=False)
# preprocess - remove NA, rows with <=25 ratings
num = num[num['Number of ratings'] > 25]
num = num.dropna()
# drop rows with both 1 or both 0 in 'Female' and 'Male gender' columns
num = num[~((num['Female'] == 1) & (num['Male gender'] == 1))]
num = num[~((num['Female'] == 0) & (num['Male gender'] == 0))]
# drop the 'Female' column as collinear
num = num.drop('Female', axis=1)
print(num['Male gender'].value_counts())
X_train = num[['Average Difficulty',
'Number of ratings',
'Received a "pepper"?',
'Proportion of students that would take the class again',
'Number of ratings from online classes',
'Male gender']]
y_train = num['Average Rating']
X_train = sm.add_constant(X_train)
model = sm.OLS(y_train, X_train).fit()
print(model.summary())
1 569
0 350
Name: Male gender, dtype: int64
OLS Regression Results
==============================================================================
Dep. Variable: Average Rating R-squared: 0.853
Model: OLS Adj. R-squared: 0.852
Method: Least Squares F-statistic: 880.8
Date: Thu, 02 Jan 2025 Prob (F-statistic): 0.00
Time: 18:01:18 Log-Likelihood: -267.67
No. Observations: 919 AIC: 549.3
Df Residuals: 912 BIC: 583.1
Df Model: 6
Covariance Type: nonrobust
==========================================================================================================================
coef std err t P>|t| [0.025 0.975]
--------------------------------------------------------------------------------------------------------------------------
const 2.3765 0.092 25.862 0.000 2.196 2.557
Average Difficulty -0.1827 0.019 -9.624 0.000 -0.220 -0.145
Number of ratings 0.0002 0.000 0.503 0.615 -0.001 0.001
Received a "pepper"? 0.2203 0.027 8.179 0.000 0.167 0.273
Proportion of students that would take the class again 0.0261 0.001 40.691 0.000 0.025 0.027
Number of ratings from online classes -0.0004 0.004 -0.107 0.914 -0.007 0.007
Male gender -0.0057 0.022 -0.253 0.800 -0.050 0.038
==============================================================================
Omnibus: 207.966 Durbin-Watson: 1.869
Prob(Omnibus): 0.000 Jarque-Bera (JB): 771.921
Skew: -1.040 Prob(JB): 2.40e-168
Kurtosis: 6.979 Cond. No. 826.
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [26]:
# Question 2: Gender difference in spread of ratings?
import pandas as pd
import numpy as np
from scipy import stats
# Seed the RNG with Vaarun Muthappan's N number
N_Number = 18851315
np.random.seed(N_Number)
# Run OLS with qualitative factors, reasoning: these are confounders that are in dataset
# Read the CSV file
num = pd.read_csv("rmpCapstoneNum.csv", names=[
'Average Rating',
'Average Difficulty',
'Number of ratings',
'Received a "pepper"?',
'Proportion of students that would take the class again',
'Number of ratings from online classes',
'Male gender',
'Female'], index_col=False)
# Preprocess: remove NA, rows with <=25 ratings
num = num[num['Number of ratings'] > 25]
num = num.dropna(subset=['Female', 'Male gender','Average Rating'])
# Drop rows with both 1 or both 0 in 'Female' and 'Male gender' columns
num = num[~((num['Female'] == 1) & (num['Male gender'] == 1))]
num = num[~((num['Female'] == 0) & (num['Male gender'] == 0))]
# Separate ratings by gender
male_ratings = num[num['Male gender'] == 1]['Average Rating']
female_ratings = num[num['Male gender'] == 0]['Average Rating']
print("male / female ratings",len(male_ratings), len(female_ratings))
variance_male = np.var(male_ratings, ddof=1)
variance_female = np.var(female_ratings, ddof=1)
# Levene's test
stat_levene, p_value_levene = stats.levene(male_ratings, female_ratings)
print(f"Variance of male ratings: {variance_male:.4f}")
print(f"Variance of female ratings: {variance_female:.4f}")
print(f"Levene's test statistic: {stat_levene:.4f}")
print(f"p-value: {p_value_levene:.4f}")
alpha = 0.005
if p_value_levene < alpha:
print("There is a statistically significant difference in the spread of ratings between genders.")
else:
print("There is no statistically significant difference in the spread of ratings between genders.")
male / female ratings 600 368 Variance of male ratings: 0.7194 Variance of female ratings: 0.7869 Levene's test statistic: 1.8664 p-value: 0.1722 There is no statistically significant difference in the spread of ratings between genders.
In [27]:
# Question 3: Size of these effects
import numpy as np
from scipy import stats
# Calculate means
male_mean = np.mean(male_ratings)
female_mean = np.mean(female_ratings)
# Calculate standard deviations
male_std = np.std(male_ratings, ddof=1)
female_std = np.std(female_ratings, ddof=1)
# Calculate pooled standard deviation
n1, n2 = len(male_ratings), len(female_ratings)
pooled_std = np.sqrt(((n1 - 1) * male_std**2 + (n2 - 1) * female_std**2) / (n1 + n2 - 2))
# Calculate Cohen's d
cohens_d = (male_mean - female_mean) / pooled_std
print(f"Cohen's d: {cohens_d}")
# Calculate degrees of freedom
df = len(male_ratings) + len(female_ratings) - 2
# Calculate standard error of d
se_d = np.sqrt((len(male_ratings) + len(female_ratings)) / (len(male_ratings) * len(female_ratings)) +
cohens_d**2 / (2 * (len(male_ratings) + len(female_ratings))))
# Calculate t-value for 95% CI
t_value = stats.t.ppf(0.975, df) # 0.975 for two-tailed 95% CI
# Calculate lower and upper bounds of CI
ci_lower = cohens_d - t_value * se_d
ci_upper = cohens_d + t_value * se_d
print(f"95% Confidence Interval for Cohen's d: [{ci_lower:.3f}, {ci_upper:.3f}]")
# For gender bias in spread of average rating
pooled_std = np.sqrt((len(male_ratings) - 1) * variance_male + (len(female_ratings) - 1) * variance_female) / (len(male_ratings) + len(female_ratings) - 2)
cohens_d_spread = (np.sqrt(variance_male) - np.sqrt(variance_female)) / pooled_std
# Calculate 95% confidence interval for Cohen's d
n1, n2 = len(male_ratings), len(female_ratings)
se = np.sqrt((n1 + n2) / (n1 * n2) + cohens_d_spread**2 / (2 * (n1 + n2)))
ci_lower_spread = cohens_d_spread - 1.96 * se
ci_upper_spread = cohens_d_spread + 1.96 * se
print(f"Cohen's d for gender bias in spread of average rating: {cohens_d_spread:.4f}")
print(f"95% CI: [{ci_lower_spread:.4f}, {ci_upper_spread:.4f}]")
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
# Combine the data into a single dataset for plotting
ratings = [male_ratings, female_ratings]
labels = ['Male Ratings', 'Female Ratings']
# Create the violin plot
plt.figure(figsize=(8, 6))
sns.violinplot(data=ratings, palette='Set2')
plt.xticks(ticks=[0, 1], labels=labels)
plt.title('Violin Plot of Male and Female Ratings')
plt.ylabel('Ratings')
plt.xlabel('')
plt.show()
Cohen's d: 0.10658257253576335 95% Confidence Interval for Cohen's d: [-0.023, 0.237] Cohen's d for gender bias in spread of average rating: -1.4017 95% CI: [-1.5458, -1.2577]
In [28]:
# Question 4: Gender dif. in each of 20 tags
import pandas as pd
import numpy as np
from scipy import stats
N_Number = 18851315
np.random.seed(N_Number)
# Read the CSV files
tags = pd.read_csv("rmpCapstoneTags.csv", names=[
"Tough grader",
"Good feedback",
"Respected",
"Lots to read",
"Participation matters",
"Don't skip class or you will not pass",
"Lots of homework",
"Inspirational",
"Pop quizzes!",
"Accessible",
"So many papers",
"Clear grading",
"Hilarious",
"Test heavy",
"Graded by few things",
"Amazing lectures",
"Caring",
"Extra credit",
"Group projects",
"Lecture heavy"], index_col=False)
num = pd.read_csv("rmpCapstoneNum.csv", names=[
'Average Rating',
'Average Difficulty',
'Number of ratings',
'Received a "pepper"?',
'Proportion of students that would take the class again',
'Number of ratings from online classes',
'Male gender',
'Female'], index_col=False)
merged_df = pd.concat([num[['Male gender', 'Female', 'Number of ratings']], tags], axis=1)
# Preprocess: remove NA, rows with <=25 ratings
merged_df = merged_df[merged_df['Number of ratings'] > 25]
merged_df = merged_df.dropna(subset=tags.columns)
merged_df = merged_df.dropna(subset=['Number of ratings','Male gender'])
# Drop rows with both 1 or both 0 in 'Female' and 'Male gender' columns
merged_df = merged_df[~((merged_df['Female'] == 1) & (merged_df['Male gender'] == 1))]
merged_df = merged_df[~((merged_df['Female'] == 0) & (merged_df['Male gender'] == 0))]
# NORMALISE
tag_columns = tags.columns
for col in tag_columns:
merged_df[col] = merged_df[col] / merged_df['Number of ratings']
# Drop the 'Female' column as collinear
merged_df = merged_df.drop(['Female','Number of ratings'], axis=1)
print(merged_df['Male gender'].value_counts())
# Store results
results = []
for tag in tag_columns:
male_values = merged_df[merged_df['Male gender'] == 1][tag]
female_values = merged_df[merged_df['Male gender'] == 0][tag]
stat, p_value = stats.mannwhitneyu(male_values, female_values, alternative='two-sided')
results.append((tag, stat,p_value))
# Convert results to DataFrame for easy sorting
results_df = pd.DataFrame(results, columns=['Tag', 'stat','p-value'])
# Sort by p-value
sorted_results = results_df.sort_values(by='p-value')
# Display results
print("Most Gendered Tags (Lowest p-values):")
print(sorted_results.head(3))
print("\nLeast Gendered Tags (Highest p-values):")
print(sorted_results.tail(3))
1 600
0 368
Name: Male gender, dtype: int64
Most Gendered Tags (Lowest p-values):
Tag stat p-value
12 Hilarious 140543.5 7.343214e-13
2 Respected 129805.5 4.271263e-06
17 Extra credit 93699.0 5.935096e-05
Least Gendered Tags (Highest p-values):
Tag stat p-value
8 Pop quizzes! 108246.5 0.535528
19 Lecture heavy 112616.5 0.595073
0 Tough grader 110058.0 0.935189
In [29]:
# Question 5: Gender dif. in terms of average difficulty?
import pandas as pd
import numpy as np
from scipy import stats
N_Number = 18851315
np.random.seed(N_Number)
# Read the CSV files
num = pd.read_csv("rmpCapstoneNum.csv", names=[
'Average Rating',
'Average Difficulty',
'Number of ratings',
'Received a "pepper"?',
'Proportion of students that would take the class again',
'Number of ratings from online classes',
'Male gender',
'Female'], index_col=False)
# Preprocess: remove NA, rows with <=25 ratings
num = num[num['Number of ratings'] > 25]
num = num.dropna(subset=['Female', 'Male gender','Average Rating','Average Difficulty'])
# Drop rows with both 1 or both 0 in 'Female' and 'Male gender' columns
num = num[~((num['Female'] == 1) & (num['Male gender'] == 1))]
num = num[~((num['Female'] == 0) & (num['Male gender'] == 0))]
# Separate ratings by gender
male_difficulty = num[num['Male gender'] == 1]['Average Difficulty']
female_difficulty = num[num['Male gender'] == 0]['Average Difficulty']
print(num['Male gender'].value_counts())
# Perform the KS test
ks_statistic, p_value = stats.mannwhitneyu(male_difficulty, female_difficulty)
# Print the results
print(f"Mann Whitney U test Statistic: {ks_statistic}")
print(f"P-value: {p_value}")
if p_value < 0.005:
print("There is a statistically significant difference in the distributions of average difficulty between male and female professors.")
else:
print("There is no statistically significant difference in the distributions of average difficulty between male and female professors.")
1 600 0 368 Name: Male gender, dtype: int64 Mann Whitney U test Statistic: 112107.5 P-value: 0.6857788248287642 There is no statistically significant difference in the distributions of average difficulty between male and female professors.
In [30]:
# Question 6: Size of above effect at 95% confidence interval
from scipy import stats
import numpy as np
# Calculate Cohen's d
def cohens_d(group1, group2):
n1, n2 = len(group1), len(group2)
var1, var2 = np.var(group1, ddof=1), np.var(group2, ddof=1)
pooled_se = np.sqrt(((n1 - 1) * var1 + (n2 - 1) * var2) / (n1 + n2 - 2))
return (np.mean(group1) - np.mean(group2)) / pooled_se
# Calculate Cohen's d
d = cohens_d(male_difficulty, female_difficulty)
print(len(male_difficulty), len(female_difficulty))
# Calculate standard error of d
n1, n2 = len(male_difficulty), len(female_difficulty)
se_d = np.sqrt((n1 + n2) / (n1 * n2) + d**2 / (2 * (n1 + n2)))
# Calculate 95% confidence interval
ci_lower = d - 1.96 * se_d
ci_upper = d + 1.96 * se_d
print(f"Cohen's d: {d:.4f}")
print(f"95% Confidence Interval: [{ci_lower:.4f}, {ci_upper:.4f}]")
600 368 Cohen's d: 0.0289 95% Confidence Interval: [-0.1009, 0.1587]
In [31]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler
# seed for reproducibility
N_Number = 18851315
np.random.seed(N_Number)
# columns to use
numerical_cols = [
'Average Rating',
'Average Difficulty',
'Number of ratings',
'Received a "pepper"?',
'Proportion of students that would take the class again',
'Number of ratings from online classes',
'Male gender',
'Female'
]
# load and preprocess data
num = num[num['Number of ratings'] > 25] # filter rows with sufficient ratings
num = num.dropna() # drop rows with missing values
# enforce mutual exclusivity in gender columns
num = num[~((num['Female'] == 1) & (num['Male gender'] == 1))]
num = num[~((num['Female'] == 0) & (num['Male gender'] == 0))]
# drop one of the gender columns (avoid redundancy)
num = num.drop('Female', axis=1)
X = num.drop('Average Rating', axis=1)
y = num['Average Rating']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=N_Number)
# standardize predictors
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)
def print_performance(model, X_test, y_test, model_name):
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"{model_name} - test set performance:")
print(f"r-squared: {r2:.4f}")
print(f"rmse: {rmse:.4f}\n")
# ols regression
ols = LinearRegression()
ols.fit(X_train_scaled, y_train)
# print ols coefficients
ols_coefficients = pd.DataFrame({
'Feature': X_train.columns,
'Coefficient': ols.coef_
}).sort_values(by='Coefficient', key=abs, ascending=False)
print("\nols coefficients:")
print(ols_coefficients)
print_performance(ols, X_test_scaled, y_test, "ols")
# ridge regression
ridge_cv = GridSearchCV(Ridge(), {'alpha': [0.01, 0.1, 1.0, 10.0, 100.0, 200.0]}, cv=5)
ridge_cv.fit(X_train_scaled, y_train)
print(f"best ridge alpha: {ridge_cv.best_params_['alpha']:.4f}")
ridge_coefficients = pd.DataFrame({
'Feature': X_train.columns,
'Coefficient': ridge_cv.best_estimator_.coef_
}).sort_values(by='Coefficient', key=abs, ascending=False)
print("\nridge coefficients:")
print(ridge_coefficients)
print_performance(ridge_cv, X_test_scaled, y_test, "ridge")
# lasso regression
lasso_cv = GridSearchCV(Lasso(), {'alpha': [0.01, 0.1, 1.0, 10.0, 100.0, 200.0]}, cv=5)
lasso_cv.fit(X_train_scaled, y_train)
print(f"best lasso alpha: {lasso_cv.best_params_['alpha']:.4f}")
lasso_coefficients = pd.DataFrame({
'Feature': X_train.columns,
'Coefficient': lasso_cv.best_estimator_.coef_
})
lasso_nonzero = lasso_coefficients[lasso_coefficients['Coefficient'] != 0].sort_values(by='Coefficient', key=abs, ascending=False)
print("\nlasso non-zero coefficients:")
print(lasso_nonzero)
print_performance(lasso_cv, X_test_scaled, y_test, "lasso")
ols coefficients:
Feature Coefficient
3 Proportion of students that would take the cla... 0.610126
0 Average Difficulty -0.126961
2 Received a "pepper"? 0.121107
5 Male gender -0.009086
1 Number of ratings 0.009050
4 Number of ratings from online classes -0.003414
ols - test set performance:
r-squared: 0.8723
rmse: 0.3107
best ridge alpha: 10.0000
ridge coefficients:
Feature Coefficient
3 Proportion of students that would take the cla... 0.596541
0 Average Difficulty -0.131580
2 Received a "pepper"? 0.125409
1 Number of ratings 0.009700
5 Male gender -0.007660
4 Number of ratings from online classes -0.002964
ridge - test set performance:
r-squared: 0.8713
rmse: 0.3119
best lasso alpha: 0.0100
lasso non-zero coefficients:
Feature Coefficient
3 Proportion of students that would take the cla... 0.606264
0 Average Difficulty -0.121337
2 Received a "pepper"? 0.115826
lasso - test set performance:
r-squared: 0.8729
rmse: 0.3099
In [32]:
# Question 8: Rating vs tags
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
import random
from statsmodels.stats.outliers_influence import variance_inflation_factor
N_Number = 18851315
np.random.seed(N_Number)
random.seed(N_Number)
# Read the CSV files
tags = pd.read_csv("rmpCapstoneTags.csv", names=[
"Tough grader",
"Good feedback",
"Respected",
"Lots to read",
"Participation matters",
"Don't skip class or you will not pass",
"Lots of homework",
"Inspirational",
"Pop quizzes!",
"Accessible",
"So many papers",
"Clear grading",
"Hilarious",
"Test heavy",
"Graded by few things",
"Amazing lectures",
"Caring",
"Extra credit",
"Group projects",
"Lecture heavy"], index_col=False)
num = pd.read_csv("rmpCapstoneNum.csv", names=[
'Average Rating',
'Average Difficulty',
'Number of ratings',
'Received a "pepper"?',
'Proportion of students that would take the class again',
'Number of ratings from online classes',
'Male gender',
'Female'], index_col=False)
# Compute VIF and correlation matrix for tag predictors to check collinearity
vif_data = pd.DataFrame()
vif_data["Feature"] = tags.columns # Tag column names
vif_data["VIF"] = [variance_inflation_factor(tags.values, i) for i in range(tags.shape[1])]
print("Variance Inflation Factor (VIF):")
print(vif_data)
corr_matrix = tags.corr()
print(corr_matrix)
# Merge the dataframes to drop NAN before splitting into X,Y and train/test
merged_df = pd.concat([num, tags], axis=1)
# Only keep rows with >25 ratings
merged_df = merged_df[merged_df['Number of ratings'] > 25]
merged_df = merged_df[~((merged_df['Female'] == 1) & (merged_df['Male gender'] == 1))]
merged_df = merged_df[~((merged_df['Female'] == 0) & (merged_df['Male gender'] == 0))]
# Can drop "respect" and "caring" as they have the highest correlation (>0.7) with other columns
merged_df.drop(['Average Difficulty',
'Received a "pepper"?',
'Proportion of students that would take the class again',
'Number of ratings from online classes',
'Male gender',
'Female'], axis=1, inplace=True)
tag_columns = tags.columns
for col in tag_columns:
merged_df[col] = merged_df[col] / merged_df['Number of ratings']
merged_df.drop(['Caring','Respected','Number of ratings'], axis=1, inplace=True)
merged_df = merged_df.dropna()
# Split the data into train and test sets
X = merged_df.iloc[:, 2:] # All columns from tags
y = merged_df['Average Rating']
print(merged_df.columns)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=N_Number)
# Function to print model summary and performance
def print_model_summary(model, X_test, y_test, model_name):
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"\n{model_name} Results:")
print(f"R-squared: {r2:.4f}")
print(f"RMSE: {rmse:.4f}")
if hasattr(model, 'coef_'):
print("\nCoefficients:")
sorted_features_coeffs = sorted(zip(X.columns, model.coef_), key=lambda x: (x[1]), reverse=True)
for feature, coef in sorted_features_coeffs:
print(f"{feature}: {coef:.4f}")
# OLS Regression
ols = LinearRegression()
ols_scores = cross_val_score(ols, X_train, y_train, cv=6, scoring='neg_mean_squared_error')
ols.fit(X_train, y_train)
print_model_summary(ols, X_test, y_test, "OLS Regression")
# Scale the features (X) and target variable (y)
scaler_X = StandardScaler()
scaler_y = StandardScaler()
# Fit the scaler on the training data and transform both train and test sets
X_train_scaled = scaler_X.fit_transform(X_train)
X_test_scaled = scaler_X.transform(X_test)
# Fit the scaler on the training target and transform both train and test targets
y_train_scaled = scaler_y.fit_transform(y_train.values.reshape(-1, 1)).flatten()
y_test_scaled = scaler_y.transform(y_test.values.reshape(-1, 1)).flatten()
# Ridge Regression
ridge_cv = RidgeCV(alphas=[0.01, 0.1, 1.0, 10.0, 100.0, 200.0], cv=6)
ridge_cv.fit(X_train_scaled, y_train_scaled)
print(f"\nBest alpha for Ridge: {ridge_cv.alpha_:.4f}")
print_model_summary(ridge_cv, X_test_scaled, y_test_scaled, "Ridge Regression")
# LASSO Regression
lasso_cv = LassoCV(alphas=[0.01, 0.1, 1.0, 10.0, 100.0, 200.0], cv=6, random_state=N_Number)
lasso_cv.fit(X_train_scaled, y_train_scaled)
print(f"\nBest alpha for LASSO: {lasso_cv.alpha_:.4f}")
print_model_summary(lasso_cv, X_test_scaled, y_test_scaled, "LASSO Regression")
Variance Inflation Factor (VIF):
Feature VIF
0 Tough grader 2.528223
1 Good feedback 3.407101
2 Respected 4.038748
3 Lots to read 1.797317
4 Participation matters 2.199905
5 Don't skip class or you will not pass 2.544241
6 Lots of homework 1.906057
7 Inspirational 2.963519
8 Pop quizzes! 1.207973
9 Accessible 1.881011
10 So many papers 1.258024
11 Clear grading 2.358959
12 Hilarious 2.069158
13 Test heavy 1.738232
14 Graded by few things 1.422271
15 Amazing lectures 3.015469
16 Caring 3.653912
17 Extra credit 1.573000
18 Group projects 1.254131
19 Lecture heavy 1.907449
Tough grader Good feedback Respected \
Tough grader 1.000000 0.229954 0.171610
Good feedback 0.229954 1.000000 0.686683
Respected 0.171610 0.686683 1.000000
Lots to read 0.499682 0.311697 0.243321
Participation matters 0.281985 0.576367 0.498679
Don't skip class or you will not pass 0.570231 0.341158 0.373045
Lots of homework 0.527124 0.304581 0.260685
Inspirational 0.140145 0.590619 0.727672
Pop quizzes! 0.302569 0.118450 0.139579
Accessible 0.255412 0.511265 0.499878
So many papers 0.282819 0.267818 0.130270
Clear grading 0.221003 0.586601 0.550932
Hilarious 0.167575 0.448301 0.567985
Test heavy 0.465068 0.113567 0.168253
Graded by few things 0.265465 0.151854 0.180604
Amazing lectures 0.189300 0.499613 0.702187
Caring 0.208095 0.707304 0.740289
Extra credit 0.190986 0.371942 0.407863
Group projects 0.185720 0.261789 0.253329
Lecture heavy 0.493056 0.156719 0.201295
Lots to read Participation matters \
Tough grader 0.499682 0.281985
Good feedback 0.311697 0.576367
Respected 0.243321 0.498679
Lots to read 1.000000 0.390119
Participation matters 0.390119 1.000000
Don't skip class or you will not pass 0.373778 0.424883
Lots of homework 0.330470 0.325916
Inspirational 0.240350 0.474341
Pop quizzes! 0.253375 0.140631
Accessible 0.214175 0.339283
So many papers 0.284022 0.176382
Clear grading 0.278827 0.442919
Hilarious 0.211889 0.422364
Test heavy 0.282345 0.157590
Graded by few things 0.223241 0.191287
Amazing lectures 0.280375 0.433445
Caring 0.250726 0.492437
Extra credit 0.229563 0.377400
Group projects 0.158490 0.355646
Lecture heavy 0.397294 0.234684
Don't skip class or you will not pass \
Tough grader 0.570231
Good feedback 0.341158
Respected 0.373045
Lots to read 0.373778
Participation matters 0.424883
Don't skip class or you will not pass 1.000000
Lots of homework 0.502055
Inspirational 0.279040
Pop quizzes! 0.315635
Accessible 0.376565
So many papers 0.159098
Clear grading 0.429774
Hilarious 0.389384
Test heavy 0.493759
Graded by few things 0.331314
Amazing lectures 0.401750
Caring 0.384829
Extra credit 0.360939
Group projects 0.214164
Lecture heavy 0.494568
Lots of homework Inspirational \
Tough grader 0.527124 0.140145
Good feedback 0.304581 0.590619
Respected 0.260685 0.727672
Lots to read 0.330470 0.240350
Participation matters 0.325916 0.474341
Don't skip class or you will not pass 0.502055 0.279040
Lots of homework 1.000000 0.168358
Inspirational 0.168358 1.000000
Pop quizzes! 0.274497 0.092585
Accessible 0.345145 0.401629
So many papers 0.196875 0.147548
Clear grading 0.345238 0.392426
Hilarious 0.183263 0.522413
Test heavy 0.336636 0.099004
Graded by few things 0.141825 0.140856
Amazing lectures 0.193409 0.692467
Caring 0.338463 0.631511
Extra credit 0.330417 0.305918
Group projects 0.164798 0.231867
Lecture heavy 0.320914 0.128204
Pop quizzes! Accessible \
Tough grader 0.302569 0.255412
Good feedback 0.118450 0.511265
Respected 0.139579 0.499878
Lots to read 0.253375 0.214175
Participation matters 0.140631 0.339283
Don't skip class or you will not pass 0.315635 0.376565
Lots of homework 0.274497 0.345145
Inspirational 0.092585 0.401629
Pop quizzes! 1.000000 0.147522
Accessible 0.147522 1.000000
So many papers 0.058107 0.102073
Clear grading 0.146541 0.502592
Hilarious 0.124886 0.373559
Test heavy 0.183711 0.258560
Graded by few things 0.097084 0.220145
Amazing lectures 0.147970 0.429340
Caring 0.127997 0.599456
Extra credit 0.137479 0.352118
Group projects 0.069418 0.175262
Lecture heavy 0.229478 0.263272
So many papers Clear grading \
Tough grader 0.282819 0.221003
Good feedback 0.267818 0.586601
Respected 0.130270 0.550932
Lots to read 0.284022 0.278827
Participation matters 0.176382 0.442919
Don't skip class or you will not pass 0.159098 0.429774
Lots of homework 0.196875 0.345238
Inspirational 0.147548 0.392426
Pop quizzes! 0.058107 0.146541
Accessible 0.102073 0.502592
So many papers 1.000000 0.116832
Clear grading 0.116832 1.000000
Hilarious 0.104625 0.417913
Test heavy 0.036550 0.264296
Graded by few things 0.082795 0.284087
Amazing lectures 0.079873 0.469776
Caring 0.139631 0.585987
Extra credit 0.061238 0.465386
Group projects 0.106371 0.238718
Lecture heavy 0.121763 0.319024
Hilarious Test heavy \
Tough grader 0.167575 0.465068
Good feedback 0.448301 0.113567
Respected 0.567985 0.168253
Lots to read 0.211889 0.282345
Participation matters 0.422364 0.157590
Don't skip class or you will not pass 0.389384 0.493759
Lots of homework 0.183263 0.336636
Inspirational 0.522413 0.099004
Pop quizzes! 0.124886 0.183711
Accessible 0.373559 0.258560
So many papers 0.104625 0.036550
Clear grading 0.417913 0.264296
Hilarious 1.000000 0.241308
Test heavy 0.241308 1.000000
Graded by few things 0.257546 0.416451
Amazing lectures 0.634742 0.221910
Caring 0.497715 0.183579
Extra credit 0.384361 0.231502
Group projects 0.288900 0.099936
Lecture heavy 0.202682 0.475742
Graded by few things Amazing lectures \
Tough grader 0.265465 0.189300
Good feedback 0.151854 0.499613
Respected 0.180604 0.702187
Lots to read 0.223241 0.280375
Participation matters 0.191287 0.433445
Don't skip class or you will not pass 0.331314 0.401750
Lots of homework 0.141825 0.193409
Inspirational 0.140856 0.692467
Pop quizzes! 0.097084 0.147970
Accessible 0.220145 0.429340
So many papers 0.082795 0.079873
Clear grading 0.284087 0.469776
Hilarious 0.257546 0.634742
Test heavy 0.416451 0.221910
Graded by few things 1.000000 0.222411
Amazing lectures 0.222411 1.000000
Caring 0.188489 0.571739
Extra credit 0.182106 0.367060
Group projects 0.134005 0.238078
Lecture heavy 0.386136 0.238837
Caring Extra credit Group projects \
Tough grader 0.208095 0.190986 0.185720
Good feedback 0.707304 0.371942 0.261789
Respected 0.740289 0.407863 0.253329
Lots to read 0.250726 0.229563 0.158490
Participation matters 0.492437 0.377400 0.355646
Don't skip class or you will not pass 0.384829 0.360939 0.214164
Lots of homework 0.338463 0.330417 0.164798
Inspirational 0.631511 0.305918 0.231867
Pop quizzes! 0.127997 0.137479 0.069418
Accessible 0.599456 0.352118 0.175262
So many papers 0.139631 0.061238 0.106371
Clear grading 0.585987 0.465386 0.238718
Hilarious 0.497715 0.384361 0.288900
Test heavy 0.183579 0.231502 0.099936
Graded by few things 0.188489 0.182106 0.134005
Amazing lectures 0.571739 0.367060 0.238078
Caring 1.000000 0.438533 0.231486
Extra credit 0.438533 1.000000 0.200623
Group projects 0.231486 0.200623 1.000000
Lecture heavy 0.224560 0.289317 0.183903
Lecture heavy
Tough grader 0.493056
Good feedback 0.156719
Respected 0.201295
Lots to read 0.397294
Participation matters 0.234684
Don't skip class or you will not pass 0.494568
Lots of homework 0.320914
Inspirational 0.128204
Pop quizzes! 0.229478
Accessible 0.263272
So many papers 0.121763
Clear grading 0.319024
Hilarious 0.202682
Test heavy 0.475742
Graded by few things 0.386136
Amazing lectures 0.238837
Caring 0.224560
Extra credit 0.289317
Group projects 0.183903
Lecture heavy 1.000000
Index(['Average Rating', 'Tough grader', 'Good feedback', 'Lots to read',
'Participation matters', 'Don't skip class or you will not pass',
'Lots of homework', 'Inspirational', 'Pop quizzes!', 'Accessible',
'So many papers', 'Clear grading', 'Hilarious', 'Test heavy',
'Graded by few things', 'Amazing lectures', 'Extra credit',
'Group projects', 'Lecture heavy'],
dtype='object')
OLS Regression Results:
R-squared: 0.7196
RMSE: 0.4460
Coefficients:
Amazing lectures: 1.5931
Clear grading: 1.2293
Good feedback: 1.1381
Inspirational: 1.0018
Accessible: 0.7893
Hilarious: 0.7184
Extra credit: 0.6528
Participation matters: 0.3864
Pop quizzes!: 0.0779
Don't skip class or you will not pass: -0.1214
Lots to read: -0.6500
Lots of homework: -0.6567
Group projects: -0.7286
So many papers: -0.8571
Graded by few things: -0.9535
Lecture heavy: -1.0738
Test heavy: -1.1652
Best alpha for Ridge: 100.0000
Ridge Regression Results:
R-squared: 0.7230
RMSE: 0.5111
Coefficients:
Amazing lectures: 0.2430
Good feedback: 0.2185
Clear grading: 0.1562
Inspirational: 0.1444
Hilarious: 0.1392
Extra credit: 0.0973
Accessible: 0.0823
Participation matters: 0.0601
Pop quizzes!: 0.0001
Don't skip class or you will not pass: -0.0316
So many papers: -0.0458
Graded by few things: -0.0505
Group projects: -0.0643
Test heavy: -0.1018
Lots to read: -0.1137
Lots of homework: -0.1257
Lecture heavy: -0.1445
Best alpha for LASSO: 0.0100
LASSO Regression Results:
R-squared: 0.7234
RMSE: 0.5107
Coefficients:
Amazing lectures: 0.2709
Good feedback: 0.2474
Clear grading: 0.1632
Inspirational: 0.1427
Hilarious: 0.1414
Extra credit: 0.1018
Accessible: 0.0788
Participation matters: 0.0502
Pop quizzes!: 0.0000
Don't skip class or you will not pass: -0.0140
So many papers: -0.0398
Graded by few things: -0.0422
Group projects: -0.0579
Test heavy: -0.0968
Lots to read: -0.1124
Lots of homework: -0.1216
Lecture heavy: -0.1485
In [33]:
# Question 9
import numpy as np
import pandas as pd
from scipy.stats import f
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
num_data = pd.read_csv('rmpCapstoneNum.csv')
qual_data = pd.read_csv('rmpCapstoneQual.csv')
tag_data = pd.read_csv('rmpCapstoneTags.csv')
## set alpha level
alpha = 0.005
N_Number = 18851315
## same N_number, keep >25 samples, dropna, drop rows that are both 0 or both 1, drop female column
np.random.seed(N_Number)
num_data.columns = [
'Average Rating',
'Average Difficulty',
'Number of Ratings',
'Received a Pepper',
'Proportion Take Again',
'Number of Online Ratings',
'Male',
'Female'
]
# Rename columns in tag_data
tag_data.columns = [
"Tough grader",
"Good feedback",
"Respected",
"Lots to read",
"Participation matters",
"Don’t skip class or you will not pass",
"Lots of homework",
"Inspirational",
"Pop quizzes!",
"Accessible",
"So many papers",
"Clear grading",
"Hilarious",
"Test heavy",
"Graded by few things",
"Amazing lectures",
"Caring",
"Extra credit",
"Group projects",
"Lecture heavy"
]
num_data = pd.concat([num_data, tag_data, qual_data], axis=1)
num_data = num_data[num_data['Male'] != num_data['Female']]
num_data = num_data[num_data['Number of Ratings'] > 25]
num_data = num_data.drop(columns=['Female'])
# Display the first few rows of the cleaned dataset
print(num_data.head())
# Print the shape of the cleaned dataset
print(f"dataset shape: {num_data.shape}")
num_data['Male'].value_counts()
filtered_num_data = num_data.dropna()
filtered_num_data['Male'].value_counts()
filtered_num_data = num_data
filtered_num_data.shape
tag_columns = [
'Tough grader', 'Good feedback', 'Respected', 'Lots to read',
'Participation matters', 'Don’t skip class or you will not pass',
'Lots of homework', 'Inspirational', 'Pop quizzes!', 'Accessible',
'So many papers', 'Clear grading', 'Hilarious', 'Test heavy',
'Graded by few things', 'Amazing lectures', 'Caring', 'Extra credit',
'Group projects', 'Lecture heavy'
]
# Normalize tags by dividing each tag column by 'Number of Ratings'
filtered_num_data[tag_columns] = filtered_num_data[tag_columns].div(filtered_num_data['Number of Ratings'], axis=0)
# Define predictors and target variable
X = filtered_num_data[tag_columns]
y = filtered_num_data['Average Difficulty']
# Run VIF on the tag columns
vif_data = pd.DataFrame()
vif_data['Feature'] = X.columns
vif_data['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
print("Variance Inflation Factor (VIF):")
print(vif_data)
# Drop 'Caring' and 'Respected' due to high VIF
X = X.drop(columns=['Caring', 'Respected'])
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=N_Number)
# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Train and Evaluate OLS Regression
ols_model = LinearRegression()
ols_model.fit(X_train_scaled, y_train)
ols_y_pred = ols_model.predict(X_test_scaled)
# Evaluate OLS
ols_r2 = r2_score(y_test, ols_y_pred)
ols_rmse = mean_squared_error(y_test, ols_y_pred, squared=False)
print(f"OLS Regression - R²: {ols_r2:.4f}, RMSE: {ols_rmse:.4f}")
# Identify most predictive tags in OLS
ols_coefficients = pd.DataFrame({
'Feature': X.columns,
'Coefficient': ols_model.coef_
}).sort_values(by='Coefficient', key=abs, ascending=False)
print("OLS Regression Coefficients:")
print(ols_coefficients)
# Train and Evaluate Ridge Regression
ridge_model = RidgeCV(alphas=[0.1, 1, 10, 100, 200], cv=5) # 5-fold cross-validation
ridge_model.fit(X_train_scaled, y_train)
ridge_y_pred = ridge_model.predict(X_test_scaled)
# Evaluate Ridge
ridge_r2 = r2_score(y_test, ridge_y_pred)
ridge_rmse = mean_squared_error(y_test, ridge_y_pred, squared=False)
print(f"Ridge Regression - Best Alpha: {ridge_model.alpha_}")
print(f"Ridge Regression - R²: {ridge_r2:.4f}, RMSE: {ridge_rmse:.4f}")
# Identify most predictive tags in Ridge
ridge_coefficients = pd.DataFrame({
'Feature': X.columns,
'Coefficient': ridge_model.coef_
}).sort_values(by='Coefficient', key=abs, ascending=False)
print("Ridge Regression Coefficients:")
print(ridge_coefficients)
# Train and Evaluate Lasso Regression
lasso_model = LassoCV(alphas=[0.1, 1, 10, 100, 200], cv=5, random_state=N_Number) # 5-fold cross-validation
lasso_model.fit(X_train_scaled, y_train)
lasso_y_pred = lasso_model.predict(X_test_scaled)
# Evaluate Lasso
lasso_r2 = r2_score(y_test, lasso_y_pred)
lasso_rmse = mean_squared_error(y_test, lasso_y_pred, squared=False)
print(f"Lasso Regression - Best Alpha: {lasso_model.alpha_}")
print(f"Lasso Regression - R²: {lasso_r2:.4f}, RMSE: {lasso_rmse:.4f}")
# Identify most predictive tags in Lasso
lasso_coefficients = pd.DataFrame({
'Feature': X.columns,
'Coefficient': lasso_model.coef_
}).sort_values(by='Coefficient', key=abs, ascending=False)
print("Lasso Regression Coefficients:")
print(lasso_coefficients)
# Compare Models
print("\nModel Comparison:")
print(f"OLS: R² = {ols_r2:.4f}, RMSE = {ols_rmse:.4f}")
print(f"Ridge: R² = {ridge_r2:.4f}, RMSE = {ridge_rmse:.4f}")
print(f"Lasso: R² = {lasso_r2:.4f}, RMSE = {lasso_rmse:.4f}")
Average Rating Average Difficulty Number of Ratings Received a Pepper \
45 4.2 1.8 26.0 1.0
118 3.2 3.9 29.0 0.0
123 5.0 1.9 60.0 1.0
169 4.8 2.6 26.0 0.0
198 3.0 3.8 26.0 0.0
Proportion Take Again Number of Online Ratings Male Tough grader \
45 57.0 8.0 1 0
118 50.0 0.0 1 14
123 100.0 4.0 0 2
169 100.0 0.0 1 0
198 70.0 0.0 0 12
Good feedback Respected ... Test heavy Graded by few things \
45 1 0 ... 0 2
118 3 2 ... 0 0
123 12 22 ... 1 1
169 3 13 ... 1 0
198 1 1 ... 5 4
Amazing lectures Caring Extra credit Group projects Lecture heavy \
45 11 1 2 0 2
118 2 5 9 0 2
123 8 36 4 0 1
169 12 9 0 0 0
198 4 0 0 2 8
Criminal Justice George Mason University VA
45 History Pasadena City College CA
118 Chemistry University of Colorado Colorado Springs CO
123 MathDevelopmental Math Lone Star College (all) TX
169 Mathematics University of Wisconsin - Milwaukee WI
198 Biology University of Louisville KY
[5 rows x 30 columns]
dataset shape: (968, 30)
Variance Inflation Factor (VIF):
Feature VIF
0 Tough grader 3.281639
1 Good feedback 3.987277
2 Respected 4.945778
3 Lots to read 2.079865
4 Participation matters 2.454495
5 Don’t skip class or you will not pass 3.163318
6 Lots of homework 1.878281
7 Inspirational 3.564262
8 Pop quizzes! 1.245734
9 Accessible 2.283465
10 So many papers 1.375255
11 Clear grading 2.856751
12 Hilarious 2.121150
13 Test heavy 2.140123
14 Graded by few things 1.828563
15 Amazing lectures 3.649584
16 Caring 5.061086
17 Extra credit 1.694762
18 Group projects 1.321729
19 Lecture heavy 2.423144
OLS Regression - R²: 0.6543, RMSE: 0.4275
OLS Regression Coefficients:
Feature Coefficient
0 Tough grader 0.310902
8 Accessible 0.137767
12 Test heavy 0.119827
10 Clear grading -0.106689
4 Don’t skip class or you will not pass 0.093906
2 Lots to read 0.090998
11 Hilarious -0.090754
15 Extra credit -0.073131
5 Lots of homework 0.064294
13 Graded by few things -0.039153
6 Inspirational -0.031628
1 Good feedback -0.020221
14 Amazing lectures 0.019113
17 Lecture heavy 0.014774
9 So many papers 0.010938
3 Participation matters -0.006292
7 Pop quizzes! 0.003608
16 Group projects -0.001595
Ridge Regression - Best Alpha: 10.0
Ridge Regression - R²: 0.6541, RMSE: 0.4276
Ridge Regression Coefficients:
Feature Coefficient
0 Tough grader 0.304869
8 Accessible 0.135161
12 Test heavy 0.118969
10 Clear grading -0.106407
4 Don’t skip class or you will not pass 0.094248
11 Hilarious -0.090628
2 Lots to read 0.090616
15 Extra credit -0.073471
5 Lots of homework 0.064203
13 Graded by few things -0.038519
6 Inspirational -0.031820
1 Good feedback -0.021367
14 Amazing lectures 0.016883
17 Lecture heavy 0.015742
9 So many papers 0.011777
3 Participation matters -0.007006
7 Pop quizzes! 0.003653
16 Group projects -0.001640
Lasso Regression - Best Alpha: 0.1
Lasso Regression - R²: 0.5415, RMSE: 0.4924
Lasso Regression Coefficients:
Feature Coefficient
0 Tough grader 0.358396
12 Test heavy 0.057843
4 Don’t skip class or you will not pass 0.044740
11 Hilarious -0.034100
10 Clear grading -0.020305
8 Accessible 0.010367
5 Lots of homework 0.006425
2 Lots to read 0.005779
16 Group projects -0.000000
15 Extra credit -0.000000
14 Amazing lectures -0.000000
13 Graded by few things 0.000000
9 So many papers 0.000000
1 Good feedback -0.000000
7 Pop quizzes! 0.000000
6 Inspirational -0.000000
3 Participation matters -0.000000
17 Lecture heavy 0.000000
Model Comparison:
OLS: R² = 0.6543, RMSE = 0.4275
Ridge: R² = 0.6541, RMSE = 0.4276
Lasso: R² = 0.5415, RMSE = 0.4924
In [34]:
# Question 10: pepper from tags + numerical
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve
import random
N_Number = 18851315
random.seed(N_Number)
np.random.seed(N_Number)
num = pd.read_csv("rmpCapstoneNum.csv", names=[
'Average Rating',
'Average Difficulty',
'Number of ratings',
'Received a "pepper"?',
'Proportion of students that would take the class again',
'Number of ratings from online classes',
'Male gender',
'Female'], index_col=False)
tags = pd.read_csv("rmpCapstoneTags.csv", names=[
"Tough grader",
"Good feedback",
"Respected",
"Lots to read",
"Participation matters",
"Don't skip class or you will not pass",
"Lots of homework",
"Inspirational",
"Pop quizzes!",
"Accessible",
"So many papers",
"Clear grading",
"Hilarious",
"Test heavy",
"Graded by few things",
"Amazing lectures",
"Caring",
"Extra credit",
"Group projects",
"Lecture heavy"], index_col=False)
# Combine the dataframes
df = pd.concat([num, tags], axis=1)
# Preprocess the data
# Keep only >25 ratings as average only makes sense then
df = df[df['Number of ratings'] > 25]
# Drop rows with both 1 or both 0 in 'Female' and 'Male gender' columns
df = df[~((df['Female'] == 1) & (df['Male gender'] == 1))]
df = df[~((df['Female'] == 0) & (df['Male gender'] == 0))]
# Drop NA and cols with colinearity, eg female, caring and respect cols
df = df.drop(['Female','Caring','Respected'], axis=1)
df = df.dropna()
# Normalize tag columns by number of ratings
tag_columns = tags.columns
for col in tag_columns:
if col != 'Caring' and col != 'Respected':
df[col] = df[col] / df['Number of ratings']
# Prepare features and target variable
X = df.drop('Received a "pepper"?', axis=1)
y = df['Received a "pepper"?']
print(X.columns)
# Display the number of values in each class of 'Received a "pepper"?'
print("Class distribution:")
print(y.value_counts())
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=N_Number, stratify=y)
num_columns = ['Average Rating',
'Average Difficulty',
'Number of ratings',
# 'Received a "pepper"?',
'Proportion of students that would take the class again',
'Number of ratings from online classes',
'Male gender']
scaler = StandardScaler()
X_train[num_columns] = scaler.fit_transform(X_train[num_columns])
X_test[num_columns] = scaler.transform(X_test[num_columns])
# Run logistic regression
model = LogisticRegression(random_state=N_Number, class_weight = 'balanced')
model.fit(X_train, y_train)
# Predict probabilities and calculate AUROC
y_pred_proba = model.predict_proba(X_test)[:, 1]
auroc = roc_auc_score(y_test, y_pred_proba)
print(f"\nAUROC: {auroc:.4f}")
print("Coefficients:")
for feature, coef in sorted(zip(X_train.columns, model.coef_[0]), key=lambda x: (x[1]), reverse=True):
print(f"{feature}: {coef:.4f}")
# Calculate ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
# Plot ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='blue', label='ROC Curve (area = {:.2f})'.format(auroc))
plt.plot([0, 1], [0, 1], color='red', linestyle='--') # Diagonal line for random guessing
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.grid()
plt.show()
Index(['Average Rating', 'Average Difficulty', 'Number of ratings',
'Proportion of students that would take the class again',
'Number of ratings from online classes', 'Male gender', 'Tough grader',
'Good feedback', 'Lots to read', 'Participation matters',
'Don't skip class or you will not pass', 'Lots of homework',
'Inspirational', 'Pop quizzes!', 'Accessible', 'So many papers',
'Clear grading', 'Hilarious', 'Test heavy', 'Graded by few things',
'Amazing lectures', 'Extra credit', 'Group projects', 'Lecture heavy'],
dtype='object')
Class distribution:
1.0 551
0.0 368
Name: Received a "pepper"?, dtype: int64
AUROC: 0.9085
Coefficients:
Inspirational: 1.9448
Amazing lectures: 1.5378
Average Rating: 1.4424
Hilarious: 0.6536
Proportion of students that would take the class again: 0.1912
Average Difficulty: 0.1681
Tough grader: 0.1155
Number of ratings from online classes: 0.0917
Number of ratings: -0.0257
Extra credit: -0.0497
Group projects: -0.0687
Graded by few things: -0.0710
So many papers: -0.0802
Good feedback: -0.0920
Lots to read: -0.1020
Don't skip class or you will not pass: -0.1413
Male gender: -0.2135
Participation matters: -0.2489
Test heavy: -0.2537
Clear grading: -0.2844
Pop quizzes!: -0.2846
Accessible: -0.4476
Lots of homework: -0.5069
Lecture heavy: -0.8455
In [35]:
## Extra Credit
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import pandas as pd
import sqlite3
import os
import math
from scipy.stats import mannwhitneyu
from scipy.stats import ks_2samp
from scipy.stats import kruskal
from scipy.stats import levene
from scipy.stats import f
import re
# Load the dataset
num_data = pd.read_csv('rmpCapstoneNum.csv')
qual_data = pd.read_csv('rmpCapstoneQual.csv')
tag_data = pd.read_csv('rmpCapstoneTags.csv')
## set alpha level
alpha = 0.005
N_Number = 18851315
## same N_number, keep >25 samples, dropna, drop rows that are both 0 or both 1, drop female column
np.random.seed(N_Number)
import pandas as pd
# Assign meaningful column names to num_data
num_data.columns = [
'Average Rating',
'Average Difficulty',
'Number of Ratings',
'Received a Pepper',
'Proportion Take Again',
'Number of Online Ratings',
'Male',
'Female'
]
qual_data.columns = [
"Major", "University", "State"
]
qual_data.head()
tag_data.columns = [
"Tough grader",
"Good feedback",
"Respected",
"Lots to read",
"Participation matters",
"Don’t skip class or you will not pass",
"Lots of homework",
"Inspirational",
"Pop quizzes!",
"Accessible",
"So many papers",
"Clear grading",
"Hilarious",
"Test heavy",
"Graded by few things",
"Amazing lectures",
"Caring",
"Extra credit",
"Group projects",
"Lecture heavy"
]
# Combine Gender Data with Tag Data
num_data = pd.concat([num_data, tag_data, qual_data], axis=1)
# Drop rows where both Male and Female are 0 or both are 1
num_data = num_data[num_data['Male'] != num_data['Female']]
# Filter out professors with 25 or fewer ratings
num_data = num_data[num_data['Number of Ratings'] > 25]
# Drop the 'Female' column
num_data = num_data.drop(columns=['Female'])
# Display the first few rows of the cleaned dataset
print(num_data.head())
# Print the shape of the cleaned dataset
print(f"dataset shape: {num_data.shape}")
num_data['Male'].value_counts()
filtered_num_data = num_data
filtered_num_data.shape
## check sample sizes by state
import matplotlib.pyplot as plt
import seaborn as sns
# Count the number of entries by state
state_distribution = filtered_num_data['State'].value_counts()
# Print the state distribution
print("Distribution of Entries by State:")
print(state_distribution)
# Plot the distribution
plt.figure(figsize=(12, 8))
sns.barplot(x=state_distribution.index, y=state_distribution.values, palette="viridis")
plt.title("Distribution of Entries by State", fontsize=16)
plt.xlabel("State", fontsize=14)
plt.ylabel("Number of Entries", fontsize=14)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
## do both KS and mann whitney U
from scipy.stats import ks_2samp, mannwhitneyu
import seaborn as sns
import matplotlib.pyplot as plt
# Filter data for Florida and California
fl_data = filtered_num_data[filtered_num_data['State'] == 'FL']
ca_data = filtered_num_data[filtered_num_data['State'] == 'CA']
# Extract the 'Average Difficulty' column for both states
fl_difficulty = fl_data['Average Difficulty'].dropna()
ca_difficulty = ca_data['Average Difficulty'].dropna()
# Ensure both datasets are not empty
if fl_difficulty.empty or ca_difficulty.empty:
print("One or both datasets are empty. Ensure the 'State' column has correct values.")
else:
# Perform KS test
ks_stat, ks_p_value = ks_2samp(fl_difficulty, ca_difficulty)
# Perform Mann-Whitney U test
mw_stat, mw_p_value = mannwhitneyu(fl_difficulty, ca_difficulty, alternative='two-sided')
# Print KS test results
print("Kolmogorov-Smirnov Test Results:")
print(f"KS Statistic: {ks_stat:.4f}")
print(f"P-value: {ks_p_value:.4f}")
# Print Mann-Whitney U test results
print("\nMann-Whitney U Test Results:")
print(f"U Statistic: {mw_stat:.4f}")
print(f"P-value: {mw_p_value:.4f}")
# Interpret the results
alpha = 0.005
print("\nInterpretation:")
if ks_p_value < alpha:
print("KS Test: Statistically significant difference in average difficulty between FL and CA.")
else:
print("KS Test: No statistically significant difference in average difficulty between FL and CA.")
if mw_p_value < alpha:
print("Mann-Whitney U Test: Statistically significant difference in average difficulty between FL and CA.")
else:
print("Mann-Whitney U Test: No statistically significant difference in average difficulty between FL and CA.")
# Visualize the distributions
plt.figure(figsize=(10, 6))
sns.kdeplot(fl_difficulty, label='Florida (FL)', shade=True, color='blue')
sns.kdeplot(ca_difficulty, label='California (CA)', shade=True, color='red')
plt.title('Average Difficulty Distribution: FL vs. CA', fontsize=16)
plt.xlabel('Average Difficulty', fontsize=14)
plt.ylabel('Density', fontsize=14)
plt.legend(title='State', fontsize=12)
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()
Average Rating Average Difficulty Number of Ratings Received a Pepper \
45 4.2 1.8 26.0 1.0
118 3.2 3.9 29.0 0.0
123 5.0 1.9 60.0 1.0
169 4.8 2.6 26.0 0.0
198 3.0 3.8 26.0 0.0
Proportion Take Again Number of Online Ratings Male Tough grader \
45 57.0 8.0 1 0
118 50.0 0.0 1 14
123 100.0 4.0 0 2
169 100.0 0.0 1 0
198 70.0 0.0 0 12
Good feedback Respected ... Test heavy Graded by few things \
45 1 0 ... 0 2
118 3 2 ... 0 0
123 12 22 ... 1 1
169 3 13 ... 1 0
198 1 1 ... 5 4
Amazing lectures Caring Extra credit Group projects Lecture heavy \
45 11 1 2 0 2
118 2 5 9 0 2
123 8 36 4 0 1
169 12 9 0 0 0
198 4 0 0 2 8
Major University State
45 History Pasadena City College CA
118 Chemistry University of Colorado Colorado Springs CO
123 MathDevelopmental Math Lone Star College (all) TX
169 Mathematics University of Wisconsin - Milwaukee WI
198 Biology University of Louisville KY
[5 rows x 30 columns]
dataset shape: (968, 30)
Distribution of Entries by State:
CA 194
TX 117
FL 113
NY 46
AZ 44
GA 35
VA 28
MI 25
ON 25
UT 23
OH 21
WA 19
IL 18
NJ 18
BC 17
AL 16
NV 16
NC 14
PA 13
MN 12
LA 12
AB 12
SC 11
WI 11
ID 10
TN 10
MD 10
MO 9
MA 9
IN 8
QC 6
OK 6
CT 5
AR 4
HI 4
SK 3
NE 3
CO 2
RI 2
NM 2
KS 2
KY 2
NS 2
DC 1
MS 1
WV 1
MB 1
DE 1
AK 1
VT 1
IA 1
OR 1
Name: State, dtype: int64
Kolmogorov-Smirnov Test Results: KS Statistic: 0.1744 P-value: 0.0222 Mann-Whitney U Test Results: U Statistic: 8885.5000 P-value: 0.0056 Interpretation: KS Test: No statistically significant difference in average difficulty between FL and CA. Mann-Whitney U Test: No statistically significant difference in average difficulty between FL and CA.
In [ ]:
In [ ]: