Back to Article
Likert Scale Dual Response
Download Notebook
In [1]:
import arviz as az
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymc as pm
import pytensor.tensor as pt
import statsmodels.api as sm

from scipy.stats import bernoulli
from statsmodels.miscmodels.ordinal_model import OrderedModel
WARNING (pytensor.tensor.blas): Using NumPy C-API based implementation for BLAS functions.
In [2]:
plt.rcParams["font.family"] = "Latin Modern Roman"
In [3]:
%config InlineBackend.figure_format = 'retina'  # high resolution figures
az.style.use("arviz-darkgrid")
rng = np.random.default_rng(42)
In [4]:
def make_data():
    np.random.seed(100)
    salary = np.random.normal(40, 10, 500)
    work_sat = np.random.beta(1, 0.4, 500)
    work_from_home = bernoulli.rvs(0.7, size=500)
    work_from_home_calc = np.where(work_from_home, 1.4 * work_from_home, work_from_home)
    latent_rating = (
        0.08423 * salary
        + 0.2 * work_sat
        + work_from_home_calc
        + np.random.normal(0, 1, 500)
    )
    explicit_rating = np.round(latent_rating, 0)
    df = pd.DataFrame(
        {
            "salary": salary,
            "work_sat": work_sat,
            "work_from_home": work_from_home,
            "latent_rating": latent_rating,
            "explicit_rating": explicit_rating,
        }
    )
    return df


try:
    df = pd.read_csv("../data/fake_employee_manger_rating.csv")
    df = df[
        ["salary", "work_sat", "work_from_home", "latent_rating", "explicit_rating"]
    ]
except FileNotFoundError:
    df = make_data()

K = len(df["explicit_rating"].unique())
df.head()
salary work_sat work_from_home latent_rating explicit_rating
0 22.502345 0.735425 1 1.376997 1.0
1 43.426804 0.570354 0 5.419863 5.0
2 51.530358 0.738212 0 5.279701 5.0
3 37.475640 0.346045 1 4.651830 5.0
4 49.813208 0.365451 1 5.788661 6.0
In [5]:
fig, axs = plt.subplots(1, 3, figsize=(15, 5))
axs = axs.flatten()
ax = axs[0]
ax.scatter(
    df["salary"],
    df["explicit_rating"],
    label="Explicit Rating",
    color="navy",
    alpha=0.3,
)
axs[1].scatter(
    df["work_from_home"],
    df["latent_rating"],
    label="Latent Rating",
    color="maroon",
    alpha=0.3,
)
axs[1].scatter(
    df["work_from_home"],
    df["explicit_rating"],
    label="Explicit Rating",
    c="navy",
    alpha=0.3,
)
axs[2].scatter(
    df["work_sat"], df["latent_rating"], label="Latent Rating", color="maroon", alpha=0.3
)
axs[2].scatter(
    df["work_sat"],
    df["explicit_rating"],
    label="Explicit Rating",
    color="navy",
    alpha=0.3,
)
ax.scatter(df["salary"], df["latent_rating"], label="Latent Sentiment", color="maroon")
ax.set_title("Manager Ratings by Salary")
axs[1].set_title("Manager Ratings by WFH")
axs[2].set_title("Manager Ratings by Work Satisfaction")
ax.set_ylabel("Latent Sentiment")
ax.set_xlabel("Employee Salary")
axs[1].set_xlabel("Work From Home: Y/N")
axs[2].set_xlabel("Employee Work Satisfaction")
axs[1].legend()

plt.show()

In [6]:
exog = sm.add_constant(df[["salary", "work_from_home", "work_sat"]])
mod = sm.OLS(df["explicit_rating"], exog)
results = mod.fit()
results.summary()

results.predict([1, 200, 1, 0.6])
fig, axs = plt.subplots(1, 2, figsize=(20, 6))
axs = axs.flatten()
ax = axs[1]
salaries = np.linspace(10, 125, 20)
predictions = [results.predict([1, i, 1, 0.6])[0] for i in salaries]
ax.plot(salaries, predictions, label="Implied Linear function of Salaries on Outcome")
ax.set_title("Out of bound Prediction based on Salary")
ax.axhline(10, linestyle="--", color="black")
ax.set_xlabel("Hypothetical Salary")
ax.set_ylabel("Manager Rating Scale")
ax.axhline(0, linestyle="--", color="black")
axs[0].hist(results.resid, ec="white")
axs[0].set_title("Simple OLS Residuals on Training Data")
Text(0.5, 1.0, 'Simple OLS Residuals on Training Data')

In [7]:
def constrainedUniform(N, min=0, max=1):
    return pm.Deterministic(
        "cutpoints",
        pt.concatenate(
            [
                np.ones(1) * min,
                pt.extra_ops.cumsum(pm.Dirichlet("cuts_unknown", a=np.ones(N - 2))) * (max - min)
                + min,
            ]
        ),
    )
In [8]:
def make_model(priors, model_spec=1, constrained_uniform=False, logit=True):
    with pm.Model() as model:
        if constrained_uniform:
            cutpoints = constrainedUniform(K, 0, K)
        else:
            sigma = pm.Exponential("sigma", priors["sigma"])
            cutpoints = pm.Normal(
                "cutpoints",
                mu=priors["mu"],
                sigma=sigma,
                transform=pm.distributions.transforms.ordered,
            )

        if model_spec == 1:
            beta = pm.Normal("beta", priors["beta"][0], priors["beta"][1], size=1)
            mu = pm.Deterministic("mu", beta[0] * df.salary)
        elif model_spec == 2:
            beta = pm.Normal("beta", priors["beta"][0], priors["beta"][1], size=2)
            mu = pm.Deterministic("mu", beta[0] * df.salary + beta[1] * df.work_sat)
        elif model_spec == 3:
            beta = pm.Normal("beta", priors["beta"][0], priors["beta"][1], size=3)
            mu = pm.Deterministic(
                "mu", beta[0] * df.salary + beta[1] * df.work_sat + beta[2] * df.work_from_home
            )
        if logit:
            y_ = pm.OrderedLogistic("y", cutpoints=cutpoints, eta=mu, observed=df.explicit_rating)
        else:
            y_ = pm.OrderedProbit("y", cutpoints=cutpoints, eta=mu, observed=df.explicit_rating)
        idata = pm.sample(nuts_sampler="numpyro", idata_kwargs={"log_likelihood": True})
        idata.extend(pm.sample_posterior_predictive(idata))
    return idata, model


priors = {"sigma": 1, "beta": [0, 1], "mu": np.linspace(0, K, K - 1)}
idata1, model1 = make_model(priors, model_spec=1)
idata2, model2 = make_model(priors, model_spec=2)
idata3, model3 = make_model(priors, model_spec=3)
idata4, model4 = make_model(priors, model_spec=3, constrained_uniform=True)
idata5, model5 = make_model(priors, model_spec=3, constrained_uniform=True)
There were 86 divergences after tuning. Increase `target_accept` or reparameterize.
The rhat statistic is larger than 1.01 for some parameters. This indicates problems during sampling. See https://arxiv.org/abs/1903.08008 for details
The effective sample size per chain is smaller than 100 for some parameters.  A higher number is needed for reliable rhat and ess computation. See https://arxiv.org/abs/1903.08008 for details
Sampling: [y]


There were 83 divergences after tuning. Increase `target_accept` or reparameterize.
The rhat statistic is larger than 1.01 for some parameters. This indicates problems during sampling. See https://arxiv.org/abs/1903.08008 for details
Sampling: [y]


There were 1 divergences after tuning. Increase `target_accept` or reparameterize.
Sampling: [y]


Sampling: [y]


Sampling: [y]


In [9]:
az.summary(idata3, var_names=["sigma", "cutpoints", "beta"])
mean sd hdi_3% hdi_97% mcse_mean mcse_sd ess_bulk ess_tail r_hat
sigma 2.237 0.733 0.845 3.551 0.017 0.012 1752.0 1723.0 1.0
cutpoints[0] -0.741 0.962 -2.649 0.953 0.023 0.017 1774.0 2082.0 1.0
cutpoints[1] 1.728 0.559 0.666 2.747 0.011 0.008 2457.0 2800.0 1.0
cutpoints[2] 3.801 0.527 2.842 4.813 0.012 0.009 1859.0 1882.0 1.0
cutpoints[3] 5.395 0.549 4.324 6.394 0.013 0.009 1716.0 1772.0 1.0
cutpoints[4] 7.047 0.602 5.891 8.185 0.015 0.010 1643.0 1705.0 1.0
cutpoints[5] 8.511 0.651 7.309 9.781 0.016 0.012 1598.0 1729.0 1.0
cutpoints[6] 10.475 0.725 9.054 11.783 0.018 0.013 1574.0 1692.0 1.0
cutpoints[7] 11.889 0.802 10.385 13.388 0.020 0.014 1614.0 1533.0 1.0
cutpoints[8] 13.237 0.939 11.543 15.113 0.023 0.016 1727.0 1741.0 1.0
beta[0] 0.133 0.011 0.113 0.154 0.000 0.000 1886.0 2062.0 1.0
beta[1] 0.211 0.278 -0.294 0.734 0.004 0.004 4208.0 3087.0 1.0
beta[2] 2.319 0.216 1.907 2.712 0.004 0.003 2496.0 2988.0 1.0
In [10]:
pm.model_to_graphviz(model3)

In [11]:
implied_probs = az.extract(idata3, var_names=["y_probs"])
implied_probs.shape
(500, 10, 4000)
In [12]:
implied_probs[0].mean(axis=1)
<xarray.DataArray 'y_probs' (y_probs_dim_1: 10)> Size: 80B
array([0.00296572, 0.02180526, 0.13602712, 0.32097524, 0.34532643,
       0.1262012 , 0.0396869 , 0.00523066, 0.00127461, 0.00050687])
Coordinates:
    y_probs_dim_0  int64 8B 0
  * y_probs_dim_1  (y_probs_dim_1) int64 80B 0 1 2 3 4 5 6 7 8 9
In [13]:
fig, ax = plt.subplots(figsize=(20, 6))
for i in range(K):
    ax.hist(implied_probs[0, i, :], label=f"Cutpoint: {i}", ec="white", bins=20, alpha=0.4)
ax.set_xlabel("Probability")
ax.set_title("Probability by Interval of Manager Rating \n by Individual 0", fontsize=20)
ax.legend();

In [14]:
implied_class = az.extract(idata3, var_names=["y"], group="posterior_predictive")
implied_class.shape
(500, 4000)
In [15]:
from scipy.stats import mode

mode(implied_class[0])
ModeResult(mode=4, count=1398)
In [16]:
fig, ax = plt.subplots(figsize=(20, 6))
ax.hist(implied_class[0], ec="white", bins=20, alpha=0.4)
ax.set_title("Distribution of Allocated Intervals for Individual 0", fontsize=20);

In [17]:
compare = az.compare(
    {
        "model_salary": idata1,
        "model_salary_worksat": idata2,
        "model_full": idata3,
        "constrained_logit_full": idata4,
        "constrained_probit_full": idata5,
    }
)

az.plot_compare(compare)
compare
rank elpd_loo p_loo elpd_diff weight se dse warning scale
model_full 0 -745.007128 11.687724 0.000000 1.000000e+00 14.843717 0.000000 False log
constrained_logit_full 1 -762.285390 8.565007 17.278262 0.000000e+00 12.979439 4.040985 False log
constrained_probit_full 2 -762.453283 8.725616 17.446156 0.000000e+00 12.991534 4.038101 False log
model_salary_worksat 3 -830.223595 7.330757 85.216467 2.369216e-13 13.914050 10.738691 False log
model_salary 4 -831.283290 7.445823 86.276162 5.598411e-11 14.583666 10.915365 False log

In [18]:
ax = az.plot_forest(
    [idata1, idata2, idata3, idata4, idata5],
    var_names=["sigma", "beta", "cutpoints"],
    combined=True,
    ridgeplot_overlap=4,
    figsize=(20, 25),
    r_hat=True,
    ridgeplot_alpha=0.3,
    model_names=[
        "model_salary",
        "model_salary_worksat",
        "model_full",
        "constrained_logit_full",
        "constrained_probit_full",
    ],
)
ax[0].set_title("Model Parameter Estimates", fontsize=20);
/Users/gzheng/projects/professional/likert_dualresponse/.venv/lib/python3.11/site-packages/arviz/stats/diagnostics.py:592: RuntimeWarning: invalid value encountered in scalar divide
  (between_chain_variance / within_chain_variance + num_samples - 1) / (num_samples)

In [19]:
az.summary(idata3, var_names=["cutpoints", "beta", "sigma"])
mean sd hdi_3% hdi_97% mcse_mean mcse_sd ess_bulk ess_tail r_hat
cutpoints[0] -0.741 0.962 -2.649 0.953 0.023 0.017 1774.0 2082.0 1.0
cutpoints[1] 1.728 0.559 0.666 2.747 0.011 0.008 2457.0 2800.0 1.0
cutpoints[2] 3.801 0.527 2.842 4.813 0.012 0.009 1859.0 1882.0 1.0
cutpoints[3] 5.395 0.549 4.324 6.394 0.013 0.009 1716.0 1772.0 1.0
cutpoints[4] 7.047 0.602 5.891 8.185 0.015 0.010 1643.0 1705.0 1.0
cutpoints[5] 8.511 0.651 7.309 9.781 0.016 0.012 1598.0 1729.0 1.0
cutpoints[6] 10.475 0.725 9.054 11.783 0.018 0.013 1574.0 1692.0 1.0
cutpoints[7] 11.889 0.802 10.385 13.388 0.020 0.014 1614.0 1533.0 1.0
cutpoints[8] 13.237 0.939 11.543 15.113 0.023 0.016 1727.0 1741.0 1.0
beta[0] 0.133 0.011 0.113 0.154 0.000 0.000 1886.0 2062.0 1.0
beta[1] 0.211 0.278 -0.294 0.734 0.004 0.004 4208.0 3087.0 1.0
beta[2] 2.319 0.216 1.907 2.712 0.004 0.003 2496.0 2988.0 1.0
sigma 2.237 0.733 0.845 3.551 0.017 0.012 1752.0 1723.0 1.0
In [20]:
def plot_fit(idata):
    posterior = idata.posterior.stack(sample=("chain", "draw"))
    fig, axs = plt.subplots(1, 2, figsize=(20, 6))
    axs = axs.flatten()
    ax = axs[0]
    for i in range(K - 1):
        ax.axvline(posterior["cutpoints"][i].mean().values, color="k")
    for r in df["explicit_rating"].unique():
        temp = df[df["explicit_rating"] == r]
        ax.hist(temp["latent_rating"], ec="white")
    ax.set_title("Latent Sentiment with Estimated Cutpoints")
    axs[1].set_title("Posterior Predictive Checks")
    az.plot_ppc(idata, ax=axs[1])
    plt.show()


plot_fit(idata3)

In [21]:
az.plot_posterior(idata3, var_names=["beta"]);

In [22]:
az.summary(idata3, var_names=["cutpoints"])
mean sd hdi_3% hdi_97% mcse_mean mcse_sd ess_bulk ess_tail r_hat
cutpoints[0] -0.741 0.962 -2.649 0.953 0.023 0.017 1774.0 2082.0 1.0
cutpoints[1] 1.728 0.559 0.666 2.747 0.011 0.008 2457.0 2800.0 1.0
cutpoints[2] 3.801 0.527 2.842 4.813 0.012 0.009 1859.0 1882.0 1.0
cutpoints[3] 5.395 0.549 4.324 6.394 0.013 0.009 1716.0 1772.0 1.0
cutpoints[4] 7.047 0.602 5.891 8.185 0.015 0.010 1643.0 1705.0 1.0
cutpoints[5] 8.511 0.651 7.309 9.781 0.016 0.012 1598.0 1729.0 1.0
cutpoints[6] 10.475 0.725 9.054 11.783 0.018 0.013 1574.0 1692.0 1.0
cutpoints[7] 11.889 0.802 10.385 13.388 0.020 0.014 1614.0 1533.0 1.0
cutpoints[8] 13.237 0.939 11.543 15.113 0.023 0.016 1727.0 1741.0 1.0
In [23]:
plot_fit(idata4)

In [24]:
az.plot_posterior(idata4, var_names=["beta"]);

In [25]:
az.summary(idata4, var_names=["cutpoints"])
/Users/gzheng/projects/professional/likert_dualresponse/.venv/lib/python3.11/site-packages/arviz/stats/diagnostics.py:592: RuntimeWarning: invalid value encountered in scalar divide
  (between_chain_variance / within_chain_variance + num_samples - 1) / (num_samples)
mean sd hdi_3% hdi_97% mcse_mean mcse_sd ess_bulk ess_tail r_hat
cutpoints[0] 0.000 0.000 0.000 0.000 0.000 0.000 4000.0 4000.0 NaN
cutpoints[1] 0.960 0.251 0.512 1.430 0.005 0.003 2923.0 2730.0 1.0
cutpoints[2] 2.596 0.218 2.199 3.012 0.003 0.002 5246.0 3327.0 1.0
cutpoints[3] 4.022 0.204 3.610 4.381 0.003 0.002 5353.0 3356.0 1.0
cutpoints[4] 5.511 0.206 5.087 5.863 0.003 0.002 4965.0 3436.0 1.0
cutpoints[5] 6.821 0.213 6.410 7.201 0.003 0.002 4131.0 3028.0 1.0
cutpoints[6] 8.490 0.211 8.094 8.885 0.004 0.003 3465.0 2656.0 1.0
cutpoints[7] 9.456 0.160 9.156 9.736 0.002 0.002 5230.0 3069.0 1.0
cutpoints[8] 10.000 0.000 10.000 10.000 0.000 0.000 3793.0 3780.0 1.0

Compare to statsmodels frequentist estimation

In [26]:
modf_logit = OrderedModel.from_formula(
    "explicit_rating ~ salary + work_sat + work_from_home", df, distr="logit"
)
resf_logit = modf_logit.fit(method="bfgs")
resf_logit.summary()
Optimization terminated successfully.
         Current function value: 1.460454
         Iterations: 78
         Function evaluations: 86
         Gradient evaluations: 86
OrderedModel Results
Dep. Variable: explicit_rating Log-Likelihood: -730.23
Model: OrderedModel AIC: 1484.
Method: Maximum Likelihood BIC: 1535.
Date: Sun, 23 Jun 2024
Time: 13:36:33
No. Observations: 500
Df Residuals: 488
Df Model: 3
coef std err z P>|z| [0.025 0.975]
salary 0.1523 0.010 15.043 0.000 0.132 0.172
work_sat 0.4690 0.289 1.621 0.105 -0.098 1.036
work_from_home 2.6517 0.214 12.391 0.000 2.232 3.071
-0.0/1.0 0.4917 1.082 0.454 0.649 -1.629 2.612
1.0/2.0 0.8069 0.428 1.884 0.060 -0.033 1.646
2.0/3.0 0.7358 0.169 4.361 0.000 0.405 1.067
3.0/4.0 0.5107 0.105 4.868 0.000 0.305 0.716
4.0/5.0 0.5646 0.081 6.952 0.000 0.405 0.724
5.0/6.0 0.4469 0.085 5.246 0.000 0.280 0.614
6.0/7.0 0.7440 0.092 8.088 0.000 0.564 0.924
7.0/8.0 0.4182 0.182 2.295 0.022 0.061 0.775
8.0/9.0 0.3358 0.332 1.013 0.311 -0.314 0.986
In [27]:
num_of_thresholds = 8
modf_logit.transform_threshold_params(resf_logit.params[-num_of_thresholds:])
array([       -inf,  0.80690515,  2.89414594,  4.56063059,  6.3193155 ,
        7.88272055,  9.98706341, 11.50621999, 12.90532916,         inf])
In [28]:
betas_posterior = az.extract(idata4)["beta"]

fig, ax = plt.subplots(figsize=(20, 10))
calc_wfh = [
    df.iloc[i]["salary"] * betas_posterior[0, :]
    + df.iloc[i]["work_sat"] * betas_posterior[1, :]
    + 1 * betas_posterior[2, :]
    for i in range(500)
]
calc_not_wfh = [
    df.iloc[i]["salary"] * betas_posterior[0, :]
    + df.iloc[i]["work_sat"] * betas_posterior[1, :]
    + 0 * betas_posterior[2, :]
    for i in range(500)
]
sal = np.random.normal(25, 5, 500)
calc_wfh_and_low_sal = [
    sal[i] * betas_posterior[0, :]
    + df.iloc[i]["work_sat"] * betas_posterior[1, :]
    + 1 * betas_posterior[2, :]
    for i in range(500)
]

### Use implied threshold on latent score to predict proportion of ratings above 7
prop_wfh = np.sum([np.mean(calc_wfh[i].values) > 6.78 for i in range(500)]) / 500
prop_not_wfh = np.sum([np.mean(calc_not_wfh[i].values) > 6.78 for i in range(500)]) / 500
prop_wfh_low = np.sum([np.mean(calc_wfh_and_low_sal[i].values) > 6.78 for i in range(500)]) / 500

for i in range(500):
    if i == 499:
        ax.hist(calc_wfh[i], alpha=0.6, color="skyblue", ec="black", label="WFH")
        ax.hist(calc_not_wfh[i], alpha=0.3, color="grey", ec="black", label="Not WFH")
        ax.hist(
            calc_wfh_and_low_sal[i], alpha=0.4, color="red", ec="black", label="WFH + Low Salary"
        )
    else:
        ax.hist(calc_wfh[i], alpha=0.6, color="skyblue", ec="black")
        ax.hist(calc_wfh_and_low_sal[i], alpha=0.4, color="red", ec="black")
        ax.hist(calc_not_wfh[i], alpha=0.3, color="grey", ec="black")
ax.set_title("Implied of Effect of Work from Home", fontsize=20)
ax.annotate(
    f"Expected Proportion > 7: \nWFH:{prop_wfh} \nWFH + LOW: {prop_wfh_low} \nNot WFH {prop_not_wfh}",
    xy=(-0.5, 1000),
    fontsize=20,
    fontweight="bold",
)
ax.legend();