# library(HDSinRdata)
# library(readr)
# library(dplyr)
# library(stringr)
library(haven)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.2
## ✔ ggplot2 4.0.0 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Data is from: https://wwwn.cdc.gov/nchs/nhanes/continuousnhanes/default.aspx?Cycle=2021-2023
# Description of data: https://wwwn.cdc.gov/nchs/nhanes/search/datapage.aspx?Component=Questionnaire&Cycle=2021-2023
data_alq = read_xpt("./data/ALQ_L.xpt") # Alcohol
data_bmx = read_xpt("./data/BMX_L.xpt") # Body measurements
data_bpq = read_xpt("./data/BPQ_L.xpt") # Blood Pressure & Cholesterol
data_demo = read_xpt("./data/DEMO_L.xpt") # Demographics data
data_fsq = read_xpt("./data/FSQ_L.xpt") # Not related to health demographics and not used.
data_inq = read_xpt("./data/INQ_L.xpt") # Income information
data_paq = read_xpt("./data/PAQ_L.xpt") # Physical activity adult
data_paqy = read_xpt("./data/PAQY_L.xpt") # Physical activity youth
data_smq = read_xpt("./data/SMQ_L.xpt") # Smoke usage
data_smqr = read_xpt("./data/SMQRTU_L.xpt") # Smoking recent
# Join the data into a single df
data_total = left_join(data_alq, data_bmx)
## Joining with `by = join_by(SEQN)`
data_total = left_join(data_total, data_bpq)
## Joining with `by = join_by(SEQN)`
data_total = left_join(data_total, data_demo)
## Joining with `by = join_by(SEQN)`
# data_total = left_join(data_total, data_fsq)
data_total = left_join(data_total, data_inq)
## Joining with `by = join_by(SEQN)`
data_total = left_join(data_total, data_paq)
## Joining with `by = join_by(SEQN)`
data_total = left_join(data_total, data_paqy)
## Joining with `by = join_by(SEQN)`
data_total = left_join(data_total, data_smq)
## Joining with `by = join_by(SEQN)`
data_total = left_join(data_total, data_smqr)
## Joining with `by = join_by(SEQN)`
Questions How are age, sex, race/ethnicity, physical activity, and smoking associated with BMI? - SLR, MLR Can we predict whether an adult is obese using demographic and lifestyle predictors? - Use selection method, and PCA. Does alcohol consumption remain associated with BMI after adjusting smoking status and physical activity? - chi squared (todo: research this topic) Hypothesis testing.
# How are age, gender, race/ethnicity, physical activity, and smoking associated with BMI?
# Linear regression: BMI ~ age + gender + physical activity + smoking
model_df = data_total %>%
transmute(
BMI = BMXBMI,
age = RIDAGEYR,
gender = RIAGENDR,
days_physical = PAD800,
smoking = SMQ040
) %>%
# Remove special missing/refused codes
filter(
!is.na(BMI),
!is.na(age),
!is.na(gender),
!is.na(days_physical),
!is.na(smoking),
!days_physical %in% c(7777, 9999), # NHANES codes: 7777-Refused, 9999-Don't know
smoking %in% c(1, 2, 3) # NHANES codes: 1-Every day, 2-Some days, 3-Not at all
) %>%
mutate(
gender = factor(gender, levels = c(1, 2),
labels = c("Male", "Female")),
smoking = factor(smoking, levels = c(1, 2, 3),
labels = c("Every day", "Some days", "Not at all"))
)
# Check the smoking factor variable levels
levels(model_df$smoking) # factor levels
## [1] "Every day" "Some days" "Not at all"
table(model_df$smoking) # counts by level
##
## Every day Some days Not at all
## 489 129 1229
prop.table(table(model_df$smoking)) # proportions
##
## Every day Some days Not at all
## 0.26475365 0.06984299 0.66540336
# Set "Not at all" as the reference level used in lm() output
model_df$smoking = relevel(model_df$smoking, ref = "Not at all")
# Check the gender factor variable levels
levels(model_df$gender) # factor levels
## [1] "Male" "Female"
table(model_df$gender) # counts by level
##
## Male Female
## 991 856
prop.table(table(model_df$gender)) # proportions
##
## Male Female
## 0.5365457 0.4634543
# Fit model
fit_bmi = lm(BMI ~ age + gender + days_physical + smoking, data = model_df)
# Results
summary(fit_bmi)
##
## Call:
## lm(formula = BMI ~ age + gender + days_physical + smoking, data = model_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.241 -4.544 -0.924 3.610 35.804
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 31.324653 0.648863 48.276 < 2e-16 ***
## age -0.035952 0.009698 -3.707 0.000216 ***
## genderFemale 1.477757 0.309635 4.773 1.96e-06 ***
## days_physical -0.002461 0.002332 -1.056 0.291277
## smokingEvery day -1.675521 0.358476 -4.674 3.17e-06 ***
## smokingSome days -0.412059 0.615586 -0.669 0.503339
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.594 on 1841 degrees of freedom
## Multiple R-squared: 0.0299, Adjusted R-squared: 0.02727
## F-statistic: 11.35 on 5 and 1841 DF, p-value: 8.396e-11
# Visual diagnostics plot
par(mfrow = c(2, 2)) # divide the plotting area into 2x2 grid
plot(fit_bmi)
# Table of coefficients
coef_table = summary(fit_bmi)$coefficients
coef_table
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 31.324652662 0.648863077 48.276214 0.000000e+00
## age -0.035952350 0.009698200 -3.707116 2.158346e-04
## genderFemale 1.477756614 0.309635051 4.772575 1.962324e-06
## days_physical -0.002461268 0.002331569 -1.055627 2.912770e-01
## smokingEvery day -1.675521206 0.358475726 -4.674016 3.168154e-06
## smokingSome days -0.412058922 0.615585698 -0.669377 5.033389e-01
# Non significant coefficients
nonsig = coef_table[coef_table[, "Pr(>|t|)"] > 0.05, , drop = FALSE]
nonsig
## Estimate Std. Error t value Pr(>|t|)
## days_physical -0.002461268 0.002331569 -1.055627 0.2912770
## smokingSome days -0.412058922 0.615585698 -0.669377 0.5033389
# install.packages("leaps") # run once
library(leaps)
# Build matrix from same formula
x = model.matrix(BMI ~ age + gender + days_physical + smoking, data = model_df)[, -1]
y = model_df$BMI
subset_fit = regsubsets(x = x, y = y, nvmax = ncol(x), method = "exhaustive")
subset_sum = summary(subset_fit)
# Compare models by adjusted R^2, Cp, BIC
subset_sum$adjr2
## [1] 0.01276930 0.02123500 0.02748571 0.02755691 0.02726544
subset_sum$cp
## [1] 29.495040 14.431831 3.582668 4.448066 6.000000
subset_sum$bic
## [1] -9.69495 -19.08167 -24.39562 -18.01197 -10.94013
subset_sum
## Subset selection object
## 5 Variables (and intercept)
## Forced in Forced out
## age FALSE FALSE
## genderFemale FALSE FALSE
## days_physical FALSE FALSE
## smokingEvery day FALSE FALSE
## smokingSome days FALSE FALSE
## 1 subsets of each size up to 5
## Selection Algorithm: exhaustive
## age genderFemale days_physical smokingEvery day smokingSome days
## 1 ( 1 ) " " "*" " " " " " "
## 2 ( 1 ) " " "*" " " "*" " "
## 3 ( 1 ) "*" "*" " " "*" " "
## 4 ( 1 ) "*" "*" "*" "*" " "
## 5 ( 1 ) "*" "*" "*" "*" "*"
# Best model size by each criterion
best_adjr2_size = which.max(subset_sum$adjr2) # identify the location of the maximum Adj. R_sq
best_cp_size = which.min(subset_sum$cp) # identify the location of the minimum Cp
best_bic_size = which.min(subset_sum$bic) # identify the location of the minimum BIC
best_adjr2_size
## [1] 4
best_cp_size
## [1] 3
best_bic_size
## [1] 3
# Coefficients for best BIC, cp, adjR^2 model
coef(subset_fit, best_adjr2_size)
## (Intercept) age genderFemale days_physical
## 31.237434165 -0.035123772 1.481611201 -0.002483286
## smokingEvery day
## -1.631353925
coef(subset_fit, best_cp_size)
## (Intercept) age genderFemale smokingEvery day
## 31.01098423 -0.03439137 1.51742579 -1.63743453
coef(subset_fit, best_bic_size)
## (Intercept) age genderFemale smokingEvery day
## 31.01098423 -0.03439137 1.51742579 -1.63743453
# Refit with selected terms
final_fit = lm(BMI ~ age + days_physical + smoking, data = model_df)
summary(final_fit)
##
## Call:
## lm(formula = BMI ~ age + days_physical + smoking, data = model_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.410 -4.522 -1.063 3.558 36.577
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 32.136001 0.629888 51.019 < 2e-16 ***
## age -0.036484 0.009755 -3.740 0.00019 ***
## days_physical -0.003667 0.002332 -1.573 0.11598
## smokingEvery day -1.713838 0.360498 -4.754 2.15e-06 ***
## smokingSome days -0.466697 0.619107 -0.754 0.45105
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.633 on 1842 degrees of freedom
## Multiple R-squared: 0.0179, Adjusted R-squared: 0.01577
## F-statistic: 8.392 on 4 and 1842 DF, p-value: 1.045e-06
# Plot RSS, adjusted R^2, Cp, and BIC for all of the models at once
par(mfrow = c(2, 2)) # divide the plotting area into 2x2 grid
plot(final_fit)
par(mfrow = c(2, 2)) # divide the plotting area into 2x2 grid
plot(subset_sum$rss, xlab="Number of Variables ", ylab = "RSS", type = "l")
points(which.min(subset_sum$rss),min(subset_sum$rss), col="red", cex=2, pch=20)
plot(subset_sum$adjr2, xlab = "Number of Variables ", ylab = "Adjusted RSq", type = "l")
points(best_adjr2_size, max(best_adjr2_size), col="red", cex=2, pch=20)
plot(subset_sum$cp, xlab = "Number of Variables ", ylab = "Cp", type = "l")
points(best_cp_size, best_cp_size, col="red", cex=2, pch=20)
plot(subset_sum$bic, xlab = "Number of Variables ", ylab = "BIC ", type = "l")
points(best_bic_size, best_bic_size, col="red", cex=2, pch=20)
# Predict adult obesity (BMI >= 30) from demographics + lifestyle using PCA + logistic regression.
pca_obesity_df = data_total %>%
transmute(
BMI = BMXBMI,
age = RIDAGEYR,
gender = RIAGENDR,
race_eth = RIDRETH3,
days_physical = PAD800,
smoking = SMQ040
) %>%
filter(
age >= 20,
!is.na(BMI),
!is.na(age),
!is.na(gender),
!is.na(race_eth),
!is.na(days_physical),
!is.na(smoking),
!days_physical %in% c(7777, 9999),
smoking %in% c(1, 2, 3)
) %>%
mutate(
gender = factor(gender, levels = c(1, 2),
labels = c("Male", "Female")),
smoking = factor(smoking, levels = c(1, 2, 3),
labels = c("Every day", "Some days", "Not at all")),
smoking = relevel(smoking, ref = "Not at all"),
race_eth = droplevels(factor(race_eth)),
obese = factor(
BMI >= 30,
levels = c(FALSE, TRUE),
labels = c("Not obese", "Obese")
)
)
# Build matrix from same formula
X_obesity = model.matrix(
~ age + gender + race_eth + days_physical + smoking,
data = pca_obesity_df)[, -1, drop = FALSE]
pca_obesity = prcomp(X_obesity, center = TRUE, scale. = TRUE)
imp = summary(pca_obesity)$importance
imp[, seq_len(min(8L, ncol(imp))), drop = FALSE]
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 1.334663 1.099251 1.080284 1.068391 1.040495 1.018784
## Proportion of Variance 0.178130 0.120840 0.116700 0.114150 0.108260 0.103790
## Cumulative Proportion 0.178130 0.298970 0.415670 0.529820 0.638080 0.741870
## PC7 PC8
## Standard deviation 0.9916625 0.9100597
## Proportion of Variance 0.0983400 0.0828200
## Cumulative Proportion 0.8402100 0.9230300
# Scree plot: proportion of variance per PC (for choosing how many PCs matter)
n_pc_obesity = ncol(pca_obesity$x)
scree_obesity_df = tibble(
pc = seq_len(n_pc_obesity),
proportion_variance = as.numeric(
imp["Proportion of Variance", seq_len(n_pc_obesity), drop = TRUE]
)
)
p_obesity_scree = ggplot(scree_obesity_df, aes(pc, proportion_variance)) +
geom_col(width = 0.65, fill = "steelblue", alpha = 0.85) +
geom_line(aes(group = 1), color = "#c0392b", linewidth = 0.6) +
geom_point(color = "#c0392b", size = 2.2) +
scale_x_continuous(breaks = scree_obesity_df$pc) +
scale_y_continuous(labels = scales::label_percent(accuracy = 0.1)) +
labs(
title = "Scree plot: demographic and lifestyle predictors",
subtitle = "BMI is not entered into PCA; components summarize age, sex, race, activity, smoking",
x = "Principal component",
y = "Proportion of variance"
) +
theme_minimal()
# Use enough PCs for glm (cap at 4); PCA is unsupervised so we keep k modest.
k_pc = min(4L, ncol(pca_obesity$x))
pc_ob_df = as_tibble(pca_obesity$x[, seq_len(k_pc), drop = FALSE]) %>%
mutate(obese = pca_obesity_df$obese)
pve1 = round(100 * imp["Proportion of Variance", "PC1"], 1)
pve2 = round(100 * imp["Proportion of Variance", "PC2"], 1)
p_obesity_pca = ggplot(pc_ob_df, aes(PC1, PC2, color = obese)) +
geom_point(alpha = 0.35, size = 1.5) +
labs(
title = "PCA of demographic and lifestyle predictors (adults 20+)",
subtitle = "Colored by obesity (BMI >= 30); BMI excluded from PCA",
x = sprintf("PC1 (%s%% variance)", pve1),
y = sprintf("PC2 (%s%% variance)", pve2),
color = NULL
) +
theme_minimal()
# Logistic regression on leading PCs (supervised step after unsupervised PCA)
pc_terms = paste0("PC", seq_len(min(4L, k_pc)))
obesity_glm_form = as.formula(paste("obese ~", paste(pc_terms, collapse = " + ")))
obesity_glm = glm(obesity_glm_form, family = binomial(), data = pc_ob_df)
summary(obesity_glm)
##
## Call:
## glm(formula = obesity_glm_form, family = binomial(), data = pc_ob_df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.42557 0.04804 -8.859 < 2e-16 ***
## PC1 -0.04338 0.03610 -1.202 0.229
## PC2 -0.04569 0.04350 -1.050 0.294
## PC3 0.04237 0.04464 0.949 0.343
## PC4 0.20523 0.04662 4.402 1.07e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2471.7 on 1839 degrees of freedom
## Residual deviance: 2448.5 on 1835 degrees of freedom
## AIC: 2458.5
##
## Number of Fisher Scoring iterations: 4
pc_ob_df = pc_ob_df %>%
mutate(
p_hat = predict(obesity_glm, type = "response"),
pred_class = factor(
ifelse(p_hat >= 0.5, "Obese", "Not obese"),
levels = c("Not obese", "Obese")
)
)
table(observed = pc_ob_df$obese, predicted = pc_ob_df$pred_class)
## predicted
## observed Not obese Obese
## Not obese 1096 14
## Obese 714 16
mean(pc_ob_df$pred_class == pc_ob_df$obese)
## [1] 0.6043478
# Variables most aligned with PC1 (absolute loadings)
load1 = sort(abs(pca_obesity$rotation[, "PC1"]), decreasing = TRUE)
head(load1, 10)
## race_eth3 race_eth4 race_eth7 race_eth2
## 0.70661107 0.41448249 0.28923049 0.27712040
## age smokingSome days race_eth6 genderFemale
## 0.21663479 0.19136124 0.18153127 0.17110476
## smokingEvery day days_physical
## 0.14012592 0.05513249
p_obesity_scree
p_obesity_pca
# Does alcohol consumption remain associated with BMI after adjusting smoking status and physical activity?
# - chi squared (todo: research this topic) Hypothesis testing.
#Is alcohol use category (ALQ_L) associated with high blood pressure (BPQ_L — "ever told you had high blood pressure")?
library(haven)
library(tidyverse)
library(gmodels)
library(corrplot)
## corrplot 0.95 loaded
cst = data_demo %>% #creating the dataframe for the chi square goodness of fit test
inner_join(data_alq, by = "SEQN") %>%
inner_join(data_bpq, by = "SEQN") %>%
filter(RIDAGEYR >= 18)
cst$ALQ111[is.na(cst$ALQ111)] = 0 # replacing na values with 0 to fit testing
cst$ALQ130[is.na(cst$ALQ130)] = 0
cst["HighBP"]= c( " ") #creating column space for high blood pressure
cst["Drinks"]= c( " ") #column for amount of drinks had per day
cst_clean = cst%>% mutate(cst, Drinks = if_else(ALQ111 == 2, "Non-drinker",
if_else(ALQ111 ==1 & ALQ130 <=2,"Light",
if_else(ALQ111 == 1 & ALQ130 <= 4, "Moderate",
if_else(ALQ111 ==1 & ALQ130 > 4, "High",
if_else(ALQ111 == 9, NA,
if_else(ALQ111 == ".", NA,
if_else(ALQ111==1 & ALQ130 ==777, NA,
if_else(ALQ111==1 & ALQ130 ==999, NA,
if_else(ALQ111==1 & ALQ130 == ".",NA,"Non-drinker"))))))))),
HighBP = if_else(BPQ080 == 1, "Yes", "No"),
HighBP = factor(HighBP, levels = c("No", "Yes")),
Drinks = factor(Drinks,
levels = c("Non-drinker", "Light",
"Moderate", "High")))
cst_clean= cst_clean[!is.na(cst_clean$Drinks),]
test_data = cst_clean %>% select(HighBP, Drinks)
contingency_table = table(test_data$HighBP, test_data$Drinks)
print(contingency_table)
##
## Non-drinker Light Moderate High
## No 913 1959 566 377
## Yes 501 1547 284 185
chisq_res = chisq.test(contingency_table) # test of independence
chisq_res
##
## Pearson's Chi-squared test
##
## data: contingency_table
## X-squared = 64.234, df = 3, p-value = 7.315e-14
chisq_res$expected
##
## Non-drinker Light Moderate High
## No 851.9283 2112.348 512.121 338.6023
## Yes 562.0717 1393.652 337.879 223.3977
print(round(chisq_res$expected,2))
##
## Non-drinker Light Moderate High
## No 851.93 2112.35 512.12 338.6
## Yes 562.07 1393.65 337.88 223.4
print(round(chisq_res$residuals, 2))
##
## Non-drinker Light Moderate High
## No 2.09 -3.34 2.38 2.09
## Yes -2.58 4.11 -2.93 -2.57
corrplot(chisq_res$residuals, is.cor = FALSE)
#Q4 - Can we predict whether an individual or household is food secure or insecure based on the following variables
library(haven)
library(tidyverse)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(rpart)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.5.2
data_fsq$FSDAD[is.na(data_fsq$FSDAD)] = 0
data_fsq["Food_insecure"]=c(" ") #column that says whether or not the individual is classified as food insecure or not
data_fsq["On_Snap"]=c(" ") #column that says whether or not they were receiving SNAP benefits in the last 12 months
data_fsq["EFood"]= c(" ") #column that says whether or not they received emergency food
data_fsq["WIC"]=c(" ") #column that says whether or not the household received WIC program benefits
data_fsq["NSnap"]= c(" ") #column that says the amount of people in a household received SNAP benefits
# adding N/A as a value to clean later on
fsq_test = data_fsq %>% mutate(data_fsq, Food_insecure= if_else(FSDAD>=3,1, #1 = yes, 2= no
if_else(FSDAD>0 & FSDAD <3, 2,
if_else(FSDAD == ".", NA, 2))),
On_Snap = if_else(FSQ012>2, NA,
if_else(FSQ012 == ".",NA, FSQ012)),
EFood = if_else(FSD151>2, NA,
if_else(FSD151 == ".",NA,FSD151)),
WIC = if_else(FSD162>2, NA,
if_else(FSD162== ".", NA, FSD162)),
NSnap =if_else(FSD165N>7, NA,
if_else(FSD165N==".",NA,FSD165N)))
demo_test = data_demo %>%
select(SEQN, age= RIDAGEYR, sex = RIAGENDR, race = RIDRETH3, education = DMDEDUC2, IPR = INDFMPIR, hh_size = DMDHHSIZ)
test_data2 = fsq_test %>%
inner_join(demo_test, by = "SEQN") %>%
select(Food_insecure,On_Snap,EFood,WIC,NSnap,age, sex, race, education, IPR, hh_size)
test_data2 = test_data2 %>% mutate(
Food_insecure = factor(Food_insecure, levels = c(1,2), labels = c("Insecure","Secure")),
sex = factor(sex, levels = c(1,2), labels = c("Male","Female")),
race = factor(race),
education = factor(education),
On_Snap = factor(On_Snap, levels = c(1,2), labels = c("Yes","No")),
EFood= factor(EFood,levels = c(1,2), labels=c("Yes","No")),
WIC = factor(WIC,levels = c(1,2), labels = c("Yes","No")))
test_data2 = test_data2 %>% na.omit(test_data2)
set.seed(1)
trainsize = floor(0.70 * nrow(test_data2))
indices = sample(seq_len(nrow(test_data2)), size = trainsize)
tdata = test_data2[-indices,]
trdata = test_data2[indices,]
prop.table(table(trdata$Food_insecure))
##
## Insecure Secure
## 0.3564972 0.6435028
FoodTree = rpart(Food_insecure~ On_Snap + EFood+ WIC + IPR + hh_size + age + sex + race + education,
trdata, method = "class", na.action = na.rpart, control = rpart.control(
cp = 0.006,
minsplit = 20,
maxdepth = 6
))
printcp(FoodTree)
##
## Classification tree:
## rpart(formula = Food_insecure ~ On_Snap + EFood + WIC + IPR +
## hh_size + age + sex + race + education, data = trdata, na.action = na.rpart,
## method = "class", control = rpart.control(cp = 0.006, minsplit = 20,
## maxdepth = 6))
##
## Variables actually used in tree construction:
## [1] age education EFood IPR race
##
## Root node error: 631/1770 = 0.3565
##
## n= 1770
##
## CP nsplit rel error xerror xstd
## 1 0.2250396 0 1.00000 1.00000 0.031935
## 2 0.0332805 1 0.77496 0.77496 0.029814
## 3 0.0079239 2 0.74168 0.74643 0.029464
## 4 0.0060000 7 0.70048 0.75277 0.029544
bestv = FoodTree$cptable[which.min(FoodTree$cptable[, "xerror"]),"CP"]
prunetree = prune(FoodTree,cp=bestv)
rpart.plot(prunetree,
type = 4,
extra = 104,
fallen.leaves = TRUE)
tree_class = predict(prunetree, tdata, type="class")
treeprob = predict(prunetree, tdata, type = "prob")[, "Insecure"]
cm = confusionMatrix(tree_class, tdata$Food_insecure, positive = "Insecure")
print(cm)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Insecure Secure
## Insecure 130 74
## Secure 147 408
##
## Accuracy : 0.7088
## 95% CI : (0.6751, 0.7409)
## No Information Rate : 0.635
## P-Value [Acc > NIR] : 1.063e-05
##
## Kappa : 0.3345
##
## Mcnemar's Test P-Value : 1.277e-06
##
## Sensitivity : 0.4693
## Specificity : 0.8465
## Pos Pred Value : 0.6373
## Neg Pred Value : 0.7351
## Prevalence : 0.3650
## Detection Rate : 0.1713
## Detection Prevalence : 0.2688
## Balanced Accuracy : 0.6579
##
## 'Positive' Class : Insecure
##
FN = cm$table[2,1]
TP = cm$table[1,1]
FNR = FN / (FN + TP)