In this project I will answer the question: what key factors within Rotten Tomatoes contribute to the classification of a movie as Rotten or Fresh?
To answer this question I will perform maximum likelihood estimation using a dataset obtained from Kaggle, which includes an extensive variety of variables. The variables I will use are tomatometer_count, audience_count, and audience_rating, with tomatometer_status (Fresh or Rotten) as the outcome.
Prior to the estimation I will clean up the data and create some plots.
# libraries
import pandas as pd
import numpy as np
import statsmodels.api as sm
from scipy import optimize
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from numba import jit
# Data
movies_df = pd.read_csv("C:/Users/ntybl/OneDrive/Documents/computation for econ/movies.csv")
# Visualize
movies_df.head()
  | rotten_tomatoes_link | movie_title | movie_info | critics_consensus | content_rating | genres | directors | authors | actors | original_release_date | ... | production_company | tomatometer_status | tomatometer_rating | tomatometer_count | audience_status | audience_rating | audience_count | tomatometer_top_critics_count | tomatometer_fresh_critics_count | tomatometer_rotten_critics_count |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | m/0814255 | Percy Jackson & the Olympians: The Lightning T... | Always trouble-prone, the life of teenager Per... | Though it may seem like just another Harry Pot... | PG | Action & Adventure, Comedy, Drama, Science Fic... | Chris Columbus | Craig Titley, Chris Columbus, Rick Riordan | Logan Lerman, Brandon T. Jackson, Alexandra Da... | 02/12/2010 | ... | 20th Century Fox | Rotten | 49.0 | 149.0 | Spilled | 53.0 | 254421.0 | 43 | 73 | 76 |
1 | m/0878835 | Please Give | Kate (Catherine Keener) and her husband Alex (... | Nicole Holofcener's newest might seem slight i... | R | Comedy | Nicole Holofcener | Nicole Holofcener | Catherine Keener, Amanda Peet, Oliver Platt, R... | 04/30/2010 | ... | Sony Pictures Classics | Fresh | 87.0 | 142.0 | Upright | 64.0 | 11574.0 | 44 | 123 | 19 |
2 | m/10 | 10 | A successful, middle-aged Hollywood songwriter... | Blake Edwards' bawdy comedy may not score a pe... | R | Comedy, Romance | Blake Edwards | Blake Edwards | Dudley Moore, Bo Derek, Julie Andrews, Robert ... | 10/05/1979 | ... | Waner Bros. | Fresh | 67.0 | 24.0 | Spilled | 53.0 | 14684.0 | 2 | 16 | 8 |
3 | m/1000013-12_angry_men | 12 Angry Men (Twelve Angry Men) | Following the closing arguments in a murder tr... | Sidney Lumet's feature debut is a superbly wri... | NR | Classics, Drama | Sidney Lumet | Reginald Rose | Martin Balsam, John Fiedler, Lee J. Cobb, E.G.... | 04/13/1957 | ... | Criterion Collection | Fresh | 100.0 | 54.0 | Upright | 97.0 | 105386.0 | 6 | 54 | 0 |
4 | m/1000079-20000_leagues_under_the_sea | 20,000 Leagues Under The Sea | In 1866, Professor Pierre M. Aronnax (Paul Luk... | One of Disney's finest live-action adventures,... | G | Action & Adventure, Drama, Kids & Family | Richard Fleischer | Earl Felton | James Mason, Kirk Douglas, Paul Lukas, Peter L... | 01/01/1954 | ... | Disney | Fresh | 89.0 | 27.0 | Upright | 74.0 | 68918.0 | 5 | 24 | 3 |
5 rows × 22 columns
# Here I will select the variables I will use in my model.
movies_df = movies_df[['tomatometer_status', 'tomatometer_count', 'audience_count','audience_rating']]
# I will also add an intercept column
movies_df['intercept'] = 1
movies_df.head()
  | tomatometer_status | tomatometer_count | audience_count | audience_rating | intercept |
---|---|---|---|---|---|
0 | Rotten | 149.0 | 254421.0 | 53.0 | 1 |
1 | Fresh | 142.0 | 11574.0 | 64.0 | 1 |
2 | Fresh | 24.0 | 14684.0 | 53.0 | 1 |
3 | Fresh | 54.0 | 105386.0 | 97.0 | 1 |
4 | Fresh | 27.0 | 68918.0 | 74.0 | 1 |
# Check for null values
nan_values = movies_df.isna().sum()
print(nan_values)
tomatometer_status     44
tomatometer_count      44
audience_count        297
audience_rating       296
intercept               0
dtype: int64
# I've decided to drop the observations with NaN in tomatometer_status and tomatometer_count, and to replace the NaNs in audience_rating and audience_count with the column means.
# Drop rows containing nan values
movies_df.dropna(subset=['tomatometer_status', 'tomatometer_count'], inplace=True)
# Replace NaN values with the mean of each column
# (the assignment form avoids the chained-assignment warnings that inplace=True on a single column can trigger in recent pandas)
movies_df['audience_rating'] = movies_df['audience_rating'].fillna(movies_df['audience_rating'].mean())
movies_df['audience_count'] = movies_df['audience_count'].fillna(movies_df['audience_count'].mean())
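As an aside, both replacements can also be done in a single call by passing a column-to-value mapping to fillna; an equivalent sketch:

# Equivalent single call: map each column to its fill value
movies_df = movies_df.fillna({'audience_rating': movies_df['audience_rating'].mean(),
                              'audience_count': movies_df['audience_count'].mean()})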
# Check if the changes were made successfully.
nan_values = movies_df.isna().sum()
print(nan_values)
tomatometer_status    0
tomatometer_count     0
audience_count        0
audience_rating       0
intercept             0
dtype: int64
# Now I will convert the values in tomatometer_status from categorical to binary.
movies_df['tomatometer_status'] = pd.get_dummies(movies_df['tomatometer_status'])['Fresh']
movies_df.head()
  | tomatometer_status | tomatometer_count | audience_count | audience_rating | intercept |
---|---|---|---|---|---|
0 | False | 149.0 | 254421.0 | 53.0 | 1 |
1 | True | 142.0 | 11574.0 | 64.0 | 1 |
2 | True | 24.0 | 14684.0 | 53.0 | 1 |
3 | True | 54.0 | 105386.0 | 97.0 | 1 |
4 | True | 27.0 | 68918.0 | 74.0 | 1 |
# Here I convert the True and False values into numerical values (1 and 0).
movies_df['tomatometer_status'] = movies_df['tomatometer_status'].astype(int)
movies_df
  | tomatometer_status | tomatometer_count | audience_count | audience_rating | intercept |
---|---|---|---|---|---|
0 | 0 | 149.0 | 254421.0 | 53.0 | 1 |
1 | 1 | 142.0 | 11574.0 | 64.0 | 1 |
2 | 1 | 24.0 | 14684.0 | 53.0 | 1 |
3 | 1 | 54.0 | 105386.0 | 97.0 | 1 |
4 | 1 | 27.0 | 68918.0 | 74.0 | 1 |
... | ... | ... | ... | ... | ... |
17707 | 0 | 9.0 | 1195.0 | 74.0 | 1 |
17708 | 1 | 291.0 | 101511.0 | 92.0 | 1 |
17709 | 1 | 10.0 | 7146.0 | 86.0 | 1 |
17710 | 1 | 23.0 | 30193.0 | 91.0 | 1 |
17711 | 0 | 8.0 | 4469.0 | 62.0 | 1 |
17668 rows × 5 columns
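As an aside, the dummy-encoding step and the integer cast above can be collapsed into a single comparison, which yields 0/1 directly and sidesteps the intermediate True/False column. An equivalent sketch, assuming 'Fresh' is the positive label:

# Equivalent one-liner: compare against the label and cast the boolean result to int
movies_df['tomatometer_status'] = (movies_df['tomatometer_status'] == 'Fresh').astype(int)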
# To finish cleaning my data I will standardize the numerical variables.
# Select the columns that will be standardized
s_columns = ['tomatometer_count', 'audience_count', 'audience_rating']
# Initialize StandardScaler
scaler = StandardScaler()
# Standardize
movies_df[s_columns] = scaler.fit_transform(movies_df[s_columns])
movies_df.head()
  | tomatometer_status | tomatometer_count | audience_count | audience_rating | intercept |
---|---|---|---|---|---|
0 | 0 | 1.343612 | 0.063059 | -0.370670 | 1 |
1 | 1 | 1.241225 | -0.075643 | 0.168923 | 1 |
2 | 1 | -0.484726 | -0.073867 | -0.370670 | 1 |
3 | 1 | -0.045925 | -0.022063 | 1.787701 | 1 |
4 | 1 | -0.440846 | -0.042891 | 0.659461 | 1 |
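For reference, StandardScaler applies the usual z-score transformation, subtracting each column's mean and dividing by its population standard deviation. A minimal manual equivalent of the transform above would be:

# Manual z-score standardization (StandardScaler uses the population std, i.e. ddof=0)
for col in s_columns:
    movies_df[col] = (movies_df[col] - movies_df[col].mean()) / movies_df[col].std(ddof=0)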
# Distribution of audience_rating
plt.hist(movies_df['audience_rating'], bins=10, color='tomato', edgecolor='black')
plt.xlabel('audience_rating')
plt.ylabel('Frequency')
plt.title('Distribution of audience_rating')
plt.show()
The histogram is roughly bell-shaped but skewed to the left: most movies have relatively high audience ratings, with a longer tail of low ratings.
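The direction of the skew can be quantified directly with pandas' built-in sample skewness; a negative value confirms the left skew:

# Sample skewness of audience_rating (negative = left-skewed)
print(movies_df['audience_rating'].skew())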
# Scatter Plot between tomatometer_count and audience_count
plt.figure(figsize=(12, 10))
plt.scatter(movies_df['tomatometer_count'], movies_df['audience_count'], color='tomato', marker='o')
# Add labels and title
plt.xlabel('Tomatometer Count')
plt.ylabel('Audience Count')
plt.title('Scatter Plot: Tomatometer Count vs. Audience Count')
# Show the plot
plt.show()
We can observe a floor effect in the data, which I assume is due to the data having a limited range; we can also observe a small number of outliers.
# MLE using the statsmodels package
p_logit = sm.MNLogit(movies_df['tomatometer_status'], movies_df[['intercept', 'tomatometer_count', 'audience_rating', 'audience_count']]).fit()
p_logit.summary()
Optimization terminated successfully.
         Current function value: 0.501130
         Iterations 6
Dep. Variable: | tomatometer_status | No. Observations: | 17668 |
---|---|---|---|
Model: | MNLogit | Df Residuals: | 17664 |
Method: | MLE | Df Model: | 3 |
Date: | Fri, 15 Dec 2023 | Pseudo R-squ.: | 0.2661 |
Time: | 01:55:42 | Log-Likelihood: | -8854.0 |
converged: | True | LL-Null: | -12064. |
Covariance Type: | nonrobust | LLR p-value: | 0.000 |
tomatometer_status=1 | coef | std err | z | P>|z| | [0.025 | 0.975] |
---|---|---|---|---|---|---|
intercept | 0.3772 | 0.019 | 20.233 | 0.000 | 0.341 | 0.414 |
tomatometer_count | -0.0443 | 0.019 | -2.337 | 0.019 | -0.082 | -0.007 |
audience_rating | 1.5164 | 0.023 | 64.641 | 0.000 | 1.470 | 1.562 |
audience_count | 0.0011 | 0.017 | 0.064 | 0.949 | -0.033 | 0.035 |
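Since the logit coefficients are on the log-odds scale, exponentiating them gives odds ratios, which can be easier to read. A quick sketch using the fitted model above (the regressors are standardized, so each ratio corresponds to a one-standard-deviation increase):

# Odds ratios for a one-standard-deviation increase in each standardized regressor
print(np.exp(p_logit.params))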
# Now I will convert my data into a NumPy array, since I will implement the multinomial logit by hand
df_m = np.array(movies_df[['intercept', 'tomatometer_count', 'audience_count', 'audience_rating', 'tomatometer_status']])
df_m
array([[ 1.        ,  1.34361176,  0.06305888, -0.37067002,  0.        ],
       [ 1.        ,  1.24122485, -0.07564333,  0.16892266,  1.        ],
       [ 1.        , -0.48472599, -0.07386705, -0.37067002,  1.        ],
       ...,
       [ 1.        , -0.68949982, -0.07817238,  1.24810801,  1.        ],
       [ 1.        , -0.49935269, -0.06500907,  1.49337741,  1.        ],
       [ 1.        , -0.71875322, -0.07970135,  0.0708149 ,  0.        ]])
# Coded MLE
# Write the sigmoid (with more than one class this is the softmax function)
@jit
def sigmoid(data, beta):
    Xb = np.dot(data, beta)           # linear index X*beta for each class
    eXb = np.exp(Xb)
    eXb = eXb / eXb.sum(1)[:, None]   # normalize each row into class probabilities
    return eXb
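A quick way to sanity-check the function: with all coefficients set to zero, every class should get equal probability and each row should sum to one. A small sketch using the first few rows of df_m:

# With zero coefficients the softmax assigns equal probability to both classes
check = sigmoid(df_m[:5, :-1], np.zeros((4, 2)))
print(check)               # all entries 0.5
print(check.sum(axis=1))   # all ones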
# Log-likelihood function for the multinomial logit
@jit
def LogL(params, *args):
    y, X, n_params, n_classes = args[0], args[1], args[2], args[3]
    beta = np.array(params).reshape(n_params, -1, order='F')  # Reshape so the elements can be multiplied correctly.
    beta[:, 0] = [0]*n_params         # Fix the base class at zero so only J-1 parameter sets are fitted.
    d = pd.get_dummies(y).to_numpy()  # One-hot matrix of observed outcomes (the pandas call likely forces numba's object-mode fallback)
    probs = sigmoid(X, beta)          # Predicted class probabilities
    logged = np.log(probs)
    ll = d * logged                   # Each observation's log-likelihood contribution
    return -np.sum(ll)                # Negative log-likelihood, since we minimize
# Define parameters and starting values
n_params = 4
n_classes = 2
starting_values = np.random.rand(n_params*n_classes)
# Optimize
optimize.minimize(LogL, x0 = starting_values, args = (df_m[:, -1], df_m[:, :-1], n_params, n_classes))
  message: Optimization terminated successfully.
  success: True
   status: 0
      fun: 8853.957838393313
        x: [ 7.808e-01  6.204e-01  4.722e-02  2.690e-01  3.772e-01 -4.434e-02  1.097e-03  1.516e+00]
      nit: 14
      jac: [ 0.000e+00  0.000e+00  0.000e+00  0.000e+00  0.000e+00  0.000e+00  0.000e+00  0.000e+00]
 hess_inv: [[ 1.000e+00  0.000e+00 ...  0.000e+00  0.000e+00]
            [ 0.000e+00  1.000e+00 ...  0.000e+00  0.000e+00]
            ...
            [ 0.000e+00  0.000e+00 ...  2.840e-04  4.325e-06]
            [ 0.000e+00  0.000e+00 ...  4.325e-06  5.919e-04]]
     nfev: 216
     njev: 24
We can observe coefficients similar to those from the statsmodels estimation: the last four entries of x match the reported coefficients for the intercept, tomatometer_count, audience_count, and audience_rating.
Note: the first time I ran this it finished, but with the message 'Desired error not necessarily achieved due to precision loss.' I decided to use @jit to see if speeding it up would help, and it did: the optimization then terminated successfully.
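To make the comparison with statsmodels explicit, the flat parameter vector returned by the optimizer can be reshaped the same way LogL does; the first column is the normalized base class, so the second column holds the fitted coefficients. A sketch, which assumes the optimizer result above is stored in a variable result:

# Hypothetical: store the result of the optimize.minimize call above in `result`
result = optimize.minimize(LogL, x0=starting_values, args=(df_m[:, -1], df_m[:, :-1], n_params, n_classes))
beta_hat = result.x.reshape(n_params, n_classes, order='F')
beta_hat[:, 0] = 0       # base-class coefficients are normalized to zero
print(beta_hat[:, 1])    # intercept, tomatometer_count, audience_count, audience_rating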
# Calculate marginal effects
p_logit.get_margeff(at='mean').summary()
Dep. Variable: | tomatometer_status |
---|---|
Method: | dydx |
At: | mean |
tomatometer_status=0 | dy/dx | std err | z | P>|z| | [0.025 | 0.975] |
---|---|---|---|---|---|---|
tomatometer_count | 0.0107 | 0.005 | 2.337 | 0.019 | 0.002 | 0.020 |
audience_rating | -0.3659 | 0.006 | -64.549 | 0.000 | -0.377 | -0.355 |
audience_count | -0.0003 | 0.004 | -0.064 | 0.949 | -0.008 | 0.008 |
tomatometer_status=1 | dy/dx | std err | z | P>|z| | [0.025 | 0.975] |
---|---|---|---|---|---|---|
tomatometer_count | -0.0107 | 0.005 | -2.337 | 0.019 | -0.020 | -0.002 |
audience_rating | 0.3659 | 0.006 | 64.549 | 0.000 | 0.355 | 0.377 |
audience_count | 0.0003 | 0.004 | 0.064 | 0.949 | -0.008 | 0.008 |
tomatometer_count: at the mean, a one-standard-deviation increase in the number of critic reviews lowers the probability of a Fresh rating by about 0.011 (p = 0.019).
audience_rating: at the mean, a one-standard-deviation increase in the audience rating raises the probability of a Fresh rating by about 0.366, by far the largest effect in the model.
audience_count: the estimated effect (0.0003) is statistically indistinguishable from zero (p = 0.949), so the number of audience ratings does not appear to matter.
Note: only critics' reviews determine whether a movie is classified as Rotten or Fresh.
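As a final sanity check, the fitted model can be used to predict the Fresh probability for a hypothetical observation (illustrative standardized values, not a row from the data; the column order matches the fit: intercept, tomatometer_count, audience_rating, audience_count):

# Hypothetical observation: average counts, audience rating one s.d. above the mean
x_new = np.array([[1.0, 0.0, 1.0, 0.0]])
print(p_logit.predict(x_new))   # columns: P(Rotten), P(Fresh)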
In this project, I explored how the variables tomatometer_count, audience_rating, and audience_count influence the prediction of whether a movie is Fresh or Rotten on Rotten Tomatoes. Surprisingly, audience_count did not seem to influence the prediction, contrary to my expectations. During the project I faced challenges with the scatter plot and with the data conversion, where 'Rotten' and 'Fresh' turned into True and False instead of 1 and 0; however, I was able to find a solution.
Overall, the project was insightful and allowed me to practice my skills in maximum likelihood estimation and data manipulation. It also helped me understand the factors influencing a movie's rating on Rotten Tomatoes.