import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math

import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.compat import lzip
import statsmodels.stats.api as sms

from statsmodels.stats.diagnostic import het_breuschpagan

import seaborn as sns


# Load the raw panel used throughout the analysis.
df = pd.read_csv('keqiang.csv')

# Naming convention (per the original author): quantitative variables are
# lowercase, qualitative ones are Capitalized, and the response is ALL CAPS.
# Renaming 'Democracy Index' brings that column in line with it.
df = df.rename(columns={'Democracy Index': 'democracy'})

# As noted in the EDA post, GDP, rail cargo and energy consumption are
# modelled on the natural-log scale. Transform each column in place, then
# prefix its name with "log_" so the transformation is visible in formulas.
logged = ('GDP', 'rail_cargo', 'energy_cons')
for column in logged:
    df[column] = df[column].map(math.log)
df = df.rename(columns={column: f'log_{column}' for column in logged})
df.head()


# Baseline OLS: log GDP explained by the two logged predictors only.
baseline_formula = 'log_GDP ~ log_energy_cons+log_rail_cargo'
fit = smf.ols(formula=baseline_formula, data=df).fit()

# Print the full regression table (coefficients, fit statistics, diagnostics).
print(fit.summary())

                            OLS Regression Results                            
==============================================================================
Dep. Variable:                log_GDP   R-squared:                       0.882
Model:                            OLS   Adj. R-squared:                  0.881
Method:                 Least Squares   F-statistic:                     3046.
Date:                Sun, 17 Dec 2023   Prob (F-statistic):               0.00
Time:                        18:33:04   Log-Likelihood:                -829.49
No. Observations:                 820   AIC:                             1665.
Df Residuals:                     817   BIC:                             1679.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
===================================================================================
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept          22.0179      0.090    244.318      0.000      21.841      22.195
log_energy_cons     1.0960      0.020     55.939      0.000       1.058       1.134
log_rail_cargo     -0.0542      0.014     -3.808      0.000      -0.082      -0.026
==============================================================================
Omnibus:                       21.372   Durbin-Watson:                   1.409
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               22.832
Skew:                          -0.365   Prob(JB):                     1.10e-05
Kurtosis:                       3.367   Cond. No.                         39.4
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.


# OLS with interactions: each logged predictor is crossed with urban_per,
# service_per and democracy. The trailing "-(...)" term removes the three
# moderators' own main effects, so they enter only through the interactions.
interaction_formula = (
    "log_GDP ~ (log_energy_cons+log_rail_cargo)"
    "*(urban_per+service_per+democracy)"
    "-(urban_per+service_per+democracy)"
)
fit = smf.ols(formula=interaction_formula, data=df).fit()

# Print the full regression table for the interaction model.
print(fit.summary())

                            OLS Regression Results                            
==============================================================================
Dep. Variable:                log_GDP   R-squared:                       0.923
Model:                            OLS   Adj. R-squared:                  0.922
Method:                 Least Squares   F-statistic:                     1217.
Date:                Sun, 17 Dec 2023   Prob (F-statistic):               0.00
Time:                        18:33:04   Log-Likelihood:                -652.94
No. Observations:                 820   AIC:                             1324.
Df Residuals:                     811   BIC:                             1366.
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
===============================================================================================
                                  coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------
Intercept                      22.1749      0.076    293.437      0.000      22.027      22.323
log_energy_cons                 0.8845      0.091      9.768      0.000       0.707       1.062
log_rail_cargo                 -0.1811      0.046     -3.937      0.000      -0.271      -0.091
log_energy_cons:urban_per      -0.0066      0.001     -6.163      0.000      -0.009      -0.005
log_energy_cons:service_per     0.0072      0.002      3.762      0.000       0.003       0.011
log_energy_cons:democracy       0.0292      0.012      2.531      0.012       0.007       0.052
log_rail_cargo:urban_per        0.0039      0.001      7.638      0.000       0.003       0.005
log_rail_cargo:service_per     -0.0015      0.001     -1.691      0.091      -0.003       0.000
log_rail_cargo:democracy       -0.0057      0.006     -1.035      0.301      -0.017       0.005
==============================================================================
Omnibus:                       16.319   Durbin-Watson:                   1.711
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               22.052
Skew:                          -0.212   Prob(JB):                     1.63e-05
Kurtosis:                       3.682   Cond. No.                     4.92e+03
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 4.92e+03. This might indicate that there are
strong multicollinearity or other numerical problems.


# Residual diagnostics for the OLS model currently bound to `fit`.

# Residuals vs. fitted values: store both on the frame so seaborn can plot
# them by column name (scatter plus a fitted regression line).
df = df.assign(fitted=fit.fittedvalues, resid=fit.resid)
sns.lmplot(x='fitted', y='resid', data=df)
plt.show()


# Distribution of the residuals.
ols_residuals = fit.resid
sns.histplot(x=ols_residuals)
plt.show()


# Normal QQ plot of the residuals; line='s' draws the standardized
# reference line.
sm.qqplot(ols_residuals, line='s')
plt.show()


# Mixed linear model: fixed effects for Year and the two logged predictors,
# with observations grouped by Country and a random intercept per group
# (re_formula="~1").
lmm_inter = smf.mixedlm(
    "log_GDP ~ Year +(log_energy_cons+log_rail_cargo)",
    data=df,
    groups=df["Country"],
    re_formula="~1",
).fit()

# Print the mixed-model results table.
print(lmm_inter.summary())

           Mixed Linear Model Regression Results
===========================================================
Model:              MixedLM   Dependent Variable:   log_GDP
No. Observations:   820       Method:               REML   
No. Groups:         93        Scale:                0.0299 
Min. group size:    1         Log-Likelihood:       45.2410
Max. group size:    12        Converged:            Yes    
Mean group size:    8.8                                    
-----------------------------------------------------------
                Coef.  Std.Err.   z    P>|z|  [0.025 0.975]
-----------------------------------------------------------
Intercept       -8.966    3.638 -2.465 0.014 -16.097 -1.836
Year             0.016    0.002  8.578 0.000   0.012  0.019
log_energy_cons  0.821    0.038 21.786 0.000   0.747  0.895
log_rail_cargo   0.032    0.020  1.600 0.110  -0.007  0.070
Group Var        0.484    0.479                            
===========================================================


# Mixed linear model with interactions: Year plus each logged predictor
# crossed with urban_per, service_per and democracy. The trailing "-(...)"
# removes the moderators' main effects. Observations are grouped by Country
# with a random intercept per group (re_formula="~1").
lmm_interaction_formula = (
    "log_GDP ~ Year + (log_energy_cons+log_rail_cargo)"
    "*(urban_per+service_per+democracy)"
    "-(urban_per+service_per+democracy)"
)
lmm_inter = smf.mixedlm(
    lmm_interaction_formula,
    data=df,
    groups=df["Country"],
    re_formula="~1",
).fit()

# Print the mixed-model results table.
print(lmm_inter.summary())

                 Mixed Linear Model Regression Results
=======================================================================
Model:                  MixedLM       Dependent Variable:       log_GDP
No. Observations:       820           Method:                   REML   
No. Groups:             93            Scale:                    0.0287 
Min. group size:        1             Log-Likelihood:           39.3498
Max. group size:        12            Converged:                Yes    
Mean group size:        8.8                                            
-----------------------------------------------------------------------
                            Coef.  Std.Err.   z    P>|z|  [0.025 0.975]
-----------------------------------------------------------------------
Intercept                   -6.317    4.014 -1.574 0.116 -14.185  1.551
Year                         0.014    0.002  7.164 0.000   0.010  0.018
log_energy_cons              0.623    0.103  6.037 0.000   0.421  0.826
log_rail_cargo              -0.013    0.050 -0.254 0.800  -0.110  0.085
log_energy_cons:urban_per   -0.006    0.002 -3.693 0.000  -0.009 -0.003
log_energy_cons:service_per  0.003    0.002  2.010 0.044   0.000  0.007
log_energy_cons:democracy    0.066    0.013  4.947 0.000   0.040  0.093
log_rail_cargo:urban_per     0.004    0.001  5.141 0.000   0.003  0.006
log_rail_cargo:service_per  -0.001    0.001 -2.089 0.037  -0.003 -0.000
log_rail_cargo:democracy    -0.025    0.006 -4.167 0.000  -0.037 -0.013
Group Var                    0.393    0.411                            
=======================================================================


# Residual diagnostics for the mixed model currently bound to `lmm_inter`.
lmm_residuals = lmm_inter.resid

# Distribution of the residuals.
sns.histplot(lmm_residuals)
plt.show()


# Normal QQ plot of the residuals; line='s' draws the standardized
# reference line.
sm.qqplot(lmm_residuals, line='s')
plt.show()


# Residuals vs. fitted values. This overwrites the 'fitted' and 'resid'
# columns on the frame with the mixed model's values.
df = df.assign(fitted=lmm_inter.fittedvalues, resid=lmm_residuals)
sns.lmplot(x='fitted', y='resid', data=df)
plt.show()


# Breusch-Pagan test for heteroscedasticity of the mixed-model residuals,
# using the model's fixed-effects design matrix as the explanatory variables.

# Labels for the four values returned by the test, in return order.
names = [
    'Lagrange multiplier statistic',
    'p-value',
    'f-value',
    'f p-value',
]

# Run the test on the residuals against the exogenous design matrix.
test_result = het_breuschpagan(lmm_inter.resid, lmm_inter.model.exog)

# Pair each label with its value for display.
list(zip(names, test_result))

[('Lagrange multiplier statistic', 96.27687606428948),
 ('p-value', 8.889448381522433e-17),
 ('f-value', 11.972698618036382),
 ('f p-value', 7.474758802932374e-18)]


# Countries with fewer than 12 rows in the panel, i.e. fewer than the
# largest per-country group size reported by the mixed model.
# `vc` ends up as a boolean Series holding only those countries (all True);
# only its index is used below.
vc = df['Country'].value_counts() < 12
vc = vc[vc]

# One (Country, Region) row per affected country, so each country is counted
# once in the plot regardless of how many rows it has.
dropped_countries = (
    df.loc[df['Country'].isin(vc.index), ['Country', 'Region']]
    .drop_duplicates()
)

# Number of affected countries per region.
ax = sns.countplot(data=dropped_countries, y='Region')
ax.set_title('Countries with missing values by region')
# Render and close out the figure, as every other plotting cell does, so the
# next countplot does not draw onto these same axes when run as a script.
plt.show()

Text(0.5, 1.0, 'Countries with missing values by region')


# Countries with fewer than four rows in the panel.
# `vc` ends up as a boolean Series holding only those countries (all True);
# only its index is used below.
vc = df['Country'].value_counts() < 4
vc = vc[vc]

# One (Country, Region) row per affected country, so each country is counted
# once in the plot regardless of how many rows it has.
dropped_countries = (
    df.loc[df['Country'].isin(vc.index), ['Country', 'Region']]
    .drop_duplicates()
)

# Number of affected countries per region.
ax = sns.countplot(data=dropped_countries, y='Region')
ax.set_title('Countries with less than four values by region')
# Render and close out the figure, as every other plotting cell does, so it
# is displayed when run as a script and later plots start on fresh axes.
plt.show()

Text(0.5, 1.0, 'Countries with less than four values by region')

	Country	Year	Region	democracy	log_energy_cons	log_rail_cargo	urban_per	service_per	log_GDP
0	Canada	2019	North America	9.22	6.304906	13.008256	81.482	67.671163	28.187045
1	United States	2019	North America	7.96	8.291391	14.672950	82.459	77.181114	30.693523
2	Austria	2019	Western Europe	8.29	4.228670	9.986725	58.515	63.129879	26.820488
3	Finland	2019	Western Europe	9.25	4.431378	9.236982	85.446	60.194782	26.316172
4	France	2019	Western Europe	8.12	6.104842	10.430967	80.709	70.043587	28.634909

Testing the Li Keqiang Index: Analysis and Discussion

To Recap

Model Building

Model Fitting

Preparing Data and Packages

Ordinary Least Squares

Mixed Linear Model

Conclusion

Discussion

Data Issues

Model Issues