Final project code

Final project report

# library(HDSinRdata)
# library(readr)
# library(dplyr)
# library(stringr)

library(haven)
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.2
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

# Data is from: https://wwwn.cdc.gov/nchs/nhanes/continuousnhanes/default.aspx?Cycle=2021-2023
# Description of data: https://wwwn.cdc.gov/nchs/nhanes/search/datapage.aspx?Component=Questionnaire&Cycle=2021-2023

data_alq = read_xpt("./data/ALQ_L.xpt")     # Alcohol
data_bmx = read_xpt("./data/BMX_L.xpt")     # Body measurements
data_bpq = read_xpt("./data/BPQ_L.xpt")     # Blood Pressure & Cholesterol
data_demo = read_xpt("./data/DEMO_L.xpt")   # Demographics data
data_fsq = read_xpt("./data/FSQ_L.xpt")   # Not related to health demographics and not used.
data_inq = read_xpt("./data/INQ_L.xpt")     # Income information
data_paq = read_xpt("./data/PAQ_L.xpt")     # Physical activity adult
data_paqy = read_xpt("./data/PAQY_L.xpt")   # Physical activity youth
data_smq = read_xpt("./data/SMQ_L.xpt")     # Smoke usage
data_smqr = read_xpt("./data/SMQRTU_L.xpt") # Smoking recent


# Join the data into a single df
data_total = left_join(data_alq, data_bmx)

## Joining with `by = join_by(SEQN)`

data_total = left_join(data_total, data_bpq)

## Joining with `by = join_by(SEQN)`

data_total = left_join(data_total, data_demo)

## Joining with `by = join_by(SEQN)`

# data_total = left_join(data_total, data_fsq)
data_total = left_join(data_total, data_inq)

## Joining with `by = join_by(SEQN)`

data_total = left_join(data_total, data_paq)

## Joining with `by = join_by(SEQN)`

data_total = left_join(data_total, data_paqy)

## Joining with `by = join_by(SEQN)`

data_total = left_join(data_total, data_smq)

## Joining with `by = join_by(SEQN)`

data_total = left_join(data_total, data_smqr)

## Joining with `by = join_by(SEQN)`

Questions How are age, sex, race/ethnicity, physical activity, and smoking associated with BMI? - SLR, MLR Can we predict whether an adult is obese using demographic and lifestyle predictors? - Use selection method, and PCA. Does alcohol consumption remain associated with BMI after adjusting smoking status and physical activity? - chi squared (todo: research this topic) Hypothesis testing.

# How are age, gender, race/ethnicity, physical activity, and smoking associated with BMI?

# Linear regression: BMI ~ age + gender + physical activity + smoking

model_df = data_total %>%
  transmute(
    BMI = BMXBMI,
    age = RIDAGEYR,
    gender = RIAGENDR,
    days_physical = PAD800,
    smoking = SMQ040
  ) %>%
  # Remove special missing/refused codes
  filter(
    !is.na(BMI),
    !is.na(age),
    !is.na(gender),
    !is.na(days_physical),
    !is.na(smoking),
    !days_physical %in% c(7777, 9999), # NHANES codes: 7777-Refused, 9999-Don't know
    smoking %in% c(1, 2, 3)  # NHANES codes: 1-Every day, 2-Some days, 3-Not at all
  ) %>%
  mutate(
    gender = factor(gender, levels = c(1, 2),
                          labels = c("Male", "Female")),
    smoking = factor(smoking, levels = c(1, 2, 3),
                     labels = c("Every day", "Some days", "Not at all"))
    )

# Check the smoking factor variable levels
levels(model_df$smoking)             # factor levels

## [1] "Every day"  "Some days"  "Not at all"

table(model_df$smoking)              # counts by level

## 
##  Every day  Some days Not at all 
##        489        129       1229

prop.table(table(model_df$smoking))  # proportions

## 
##  Every day  Some days Not at all 
## 0.26475365 0.06984299 0.66540336

# Set "Not at all" as the reference level used in lm() output
model_df$smoking = relevel(model_df$smoking, ref = "Not at all")


# Check the gender factor variable levels
levels(model_df$gender)             # factor levels

## [1] "Male"   "Female"

table(model_df$gender)              # counts by level

## 
##   Male Female 
##    991    856

prop.table(table(model_df$gender))  # proportions

## 
##      Male    Female 
## 0.5365457 0.4634543

# Fit model
fit_bmi = lm(BMI ~ age + gender + days_physical + smoking, data = model_df)

# Results
summary(fit_bmi)

## 
## Call:
## lm(formula = BMI ~ age + gender + days_physical + smoking, data = model_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -14.241  -4.544  -0.924   3.610  35.804 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      31.324653   0.648863  48.276  < 2e-16 ***
## age              -0.035952   0.009698  -3.707 0.000216 ***
## genderFemale      1.477757   0.309635   4.773 1.96e-06 ***
## days_physical    -0.002461   0.002332  -1.056 0.291277    
## smokingEvery day -1.675521   0.358476  -4.674 3.17e-06 ***
## smokingSome days -0.412059   0.615586  -0.669 0.503339    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.594 on 1841 degrees of freedom
## Multiple R-squared:  0.0299, Adjusted R-squared:  0.02727 
## F-statistic: 11.35 on 5 and 1841 DF,  p-value: 8.396e-11

# Visual diagnostics plot
par(mfrow = c(2, 2)) # divide the plotting area into 2x2 grid
plot(fit_bmi)

# Table of coefficients
coef_table = summary(fit_bmi)$coefficients
coef_table

##                      Estimate  Std. Error   t value     Pr(>|t|)
## (Intercept)      31.324652662 0.648863077 48.276214 0.000000e+00
## age              -0.035952350 0.009698200 -3.707116 2.158346e-04
## genderFemale      1.477756614 0.309635051  4.772575 1.962324e-06
## days_physical    -0.002461268 0.002331569 -1.055627 2.912770e-01
## smokingEvery day -1.675521206 0.358475726 -4.674016 3.168154e-06
## smokingSome days -0.412058922 0.615585698 -0.669377 5.033389e-01

# Non significant coefficients
nonsig = coef_table[coef_table[, "Pr(>|t|)"] > 0.05, , drop = FALSE]
nonsig

##                      Estimate  Std. Error   t value  Pr(>|t|)
## days_physical    -0.002461268 0.002331569 -1.055627 0.2912770
## smokingSome days -0.412058922 0.615585698 -0.669377 0.5033389

# install.packages("leaps")  # run once
library(leaps)

# Build matrix from same formula
x = model.matrix(BMI ~ age + gender + days_physical + smoking, data = model_df)[, -1]
y = model_df$BMI

subset_fit = regsubsets(x = x, y = y, nvmax = ncol(x), method = "exhaustive")
subset_sum = summary(subset_fit)

# Compare models by adjusted R^2, Cp, BIC
subset_sum$adjr2

## [1] 0.01276930 0.02123500 0.02748571 0.02755691 0.02726544

subset_sum$cp

## [1] 29.495040 14.431831  3.582668  4.448066  6.000000

subset_sum$bic

## [1]  -9.69495 -19.08167 -24.39562 -18.01197 -10.94013

subset_sum

## Subset selection object
## 5 Variables  (and intercept)
##                  Forced in Forced out
## age                  FALSE      FALSE
## genderFemale         FALSE      FALSE
## days_physical        FALSE      FALSE
## smokingEvery day     FALSE      FALSE
## smokingSome days     FALSE      FALSE
## 1 subsets of each size up to 5
## Selection Algorithm: exhaustive
##          age genderFemale days_physical smokingEvery day smokingSome days
## 1  ( 1 ) " " "*"          " "           " "              " "             
## 2  ( 1 ) " " "*"          " "           "*"              " "             
## 3  ( 1 ) "*" "*"          " "           "*"              " "             
## 4  ( 1 ) "*" "*"          "*"           "*"              " "             
## 5  ( 1 ) "*" "*"          "*"           "*"              "*"

# Best model size by each criterion
best_adjr2_size = which.max(subset_sum$adjr2) # identify the location of the maximum Adj. R_sq
best_cp_size    = which.min(subset_sum$cp)    # identify the location of the minimum Cp
best_bic_size   = which.min(subset_sum$bic)   # identify the location of the minimum BIC

best_adjr2_size

## [1] 4

best_cp_size

## [1] 3

best_bic_size

## [1] 3

# Coefficients for best BIC, cp, adjR^2 model
coef(subset_fit, best_adjr2_size)

##      (Intercept)              age     genderFemale    days_physical 
##     31.237434165     -0.035123772      1.481611201     -0.002483286 
## smokingEvery day 
##     -1.631353925

coef(subset_fit, best_cp_size)

##      (Intercept)              age     genderFemale smokingEvery day 
##      31.01098423      -0.03439137       1.51742579      -1.63743453

coef(subset_fit, best_bic_size)

##      (Intercept)              age     genderFemale smokingEvery day 
##      31.01098423      -0.03439137       1.51742579      -1.63743453

# Refit with selected terms
final_fit = lm(BMI ~ age + days_physical + smoking, data = model_df)
summary(final_fit)

## 
## Call:
## lm(formula = BMI ~ age + days_physical + smoking, data = model_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -13.410  -4.522  -1.063   3.558  36.577 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      32.136001   0.629888  51.019  < 2e-16 ***
## age              -0.036484   0.009755  -3.740  0.00019 ***
## days_physical    -0.003667   0.002332  -1.573  0.11598    
## smokingEvery day -1.713838   0.360498  -4.754 2.15e-06 ***
## smokingSome days -0.466697   0.619107  -0.754  0.45105    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.633 on 1842 degrees of freedom
## Multiple R-squared:  0.0179, Adjusted R-squared:  0.01577 
## F-statistic: 8.392 on 4 and 1842 DF,  p-value: 1.045e-06

# Plot RSS, adjusted R^2, Cp, and BIC for all of the models at once
par(mfrow = c(2, 2)) # divide the plotting area into 2x2 grid
plot(final_fit)

par(mfrow = c(2, 2)) # divide the plotting area into 2x2 grid
plot(subset_sum$rss, xlab="Number of Variables ", ylab = "RSS", type = "l")
points(which.min(subset_sum$rss),min(subset_sum$rss), col="red", cex=2, pch=20)

plot(subset_sum$adjr2, xlab = "Number of Variables ", ylab = "Adjusted RSq", type = "l")
points(best_adjr2_size, max(best_adjr2_size), col="red", cex=2, pch=20)

plot(subset_sum$cp, xlab = "Number of Variables ", ylab = "Cp", type = "l")
points(best_cp_size, best_cp_size, col="red", cex=2, pch=20)

plot(subset_sum$bic, xlab = "Number of Variables ", ylab = "BIC ", type = "l")
points(best_bic_size, best_bic_size, col="red", cex=2, pch=20)

# Predict adult obesity (BMI >= 30) from demographics + lifestyle using PCA + logistic regression.

pca_obesity_df = data_total %>%
  transmute(
    BMI = BMXBMI,
    age = RIDAGEYR,
    gender = RIAGENDR,
    race_eth = RIDRETH3,
    days_physical = PAD800,
    smoking = SMQ040
  ) %>%
  filter(
    age >= 20,
    !is.na(BMI),
    !is.na(age),
    !is.na(gender),
    !is.na(race_eth),
    !is.na(days_physical),
    !is.na(smoking),
    !days_physical %in% c(7777, 9999),
    smoking %in% c(1, 2, 3)
  ) %>%
  mutate(
    gender = factor(gender, levels = c(1, 2),
                    labels = c("Male", "Female")),
    smoking = factor(smoking, levels = c(1, 2, 3),
                     labels = c("Every day", "Some days", "Not at all")),
    smoking = relevel(smoking, ref = "Not at all"),
    race_eth = droplevels(factor(race_eth)),
    obese = factor(
      BMI >= 30,
      levels = c(FALSE, TRUE),
      labels = c("Not obese", "Obese")
    )
  )

# Build matrix from same formula
X_obesity = model.matrix(
  ~ age + gender + race_eth + days_physical + smoking,
  data = pca_obesity_df)[, -1, drop = FALSE]

pca_obesity = prcomp(X_obesity, center = TRUE, scale. = TRUE)

imp = summary(pca_obesity)$importance
imp[, seq_len(min(8L, ncol(imp))), drop = FALSE]

##                             PC1      PC2      PC3      PC4      PC5      PC6
## Standard deviation     1.334663 1.099251 1.080284 1.068391 1.040495 1.018784
## Proportion of Variance 0.178130 0.120840 0.116700 0.114150 0.108260 0.103790
## Cumulative Proportion  0.178130 0.298970 0.415670 0.529820 0.638080 0.741870
##                              PC7       PC8
## Standard deviation     0.9916625 0.9100597
## Proportion of Variance 0.0983400 0.0828200
## Cumulative Proportion  0.8402100 0.9230300

# Scree plot: proportion of variance per PC (for choosing how many PCs matter)
n_pc_obesity = ncol(pca_obesity$x)
scree_obesity_df = tibble(
  pc = seq_len(n_pc_obesity),
  proportion_variance = as.numeric(
    imp["Proportion of Variance", seq_len(n_pc_obesity), drop = TRUE]
  )
)
p_obesity_scree = ggplot(scree_obesity_df, aes(pc, proportion_variance)) +
  geom_col(width = 0.65, fill = "steelblue", alpha = 0.85) +
  geom_line(aes(group = 1), color = "#c0392b", linewidth = 0.6) +
  geom_point(color = "#c0392b", size = 2.2) +
  scale_x_continuous(breaks = scree_obesity_df$pc) +
  scale_y_continuous(labels = scales::label_percent(accuracy = 0.1)) +
  labs(
    title = "Scree plot: demographic and lifestyle predictors",
    subtitle = "BMI is not entered into PCA; components summarize age, sex, race, activity, smoking",
    x = "Principal component",
    y = "Proportion of variance"
  ) +
  theme_minimal()

# Use enough PCs for glm (cap at 4); PCA is unsupervised so we keep k modest.
k_pc = min(4L, ncol(pca_obesity$x))

pc_ob_df = as_tibble(pca_obesity$x[, seq_len(k_pc), drop = FALSE]) %>%
  mutate(obese = pca_obesity_df$obese)

pve1 = round(100 * imp["Proportion of Variance", "PC1"], 1)
pve2 = round(100 * imp["Proportion of Variance", "PC2"], 1)

p_obesity_pca = ggplot(pc_ob_df, aes(PC1, PC2, color = obese)) +
  geom_point(alpha = 0.35, size = 1.5) +
  labs(
    title = "PCA of demographic and lifestyle predictors (adults 20+)",
    subtitle = "Colored by obesity (BMI >= 30); BMI excluded from PCA",
    x = sprintf("PC1 (%s%% variance)", pve1),
    y = sprintf("PC2 (%s%% variance)", pve2),
    color = NULL
  ) +
  theme_minimal()

# Logistic regression on leading PCs (supervised step after unsupervised PCA)
pc_terms = paste0("PC", seq_len(min(4L, k_pc)))
obesity_glm_form = as.formula(paste("obese ~", paste(pc_terms, collapse = " + ")))
obesity_glm = glm(obesity_glm_form, family = binomial(), data = pc_ob_df)
summary(obesity_glm)

## 
## Call:
## glm(formula = obesity_glm_form, family = binomial(), data = pc_ob_df)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.42557    0.04804  -8.859  < 2e-16 ***
## PC1         -0.04338    0.03610  -1.202    0.229    
## PC2         -0.04569    0.04350  -1.050    0.294    
## PC3          0.04237    0.04464   0.949    0.343    
## PC4          0.20523    0.04662   4.402 1.07e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2471.7  on 1839  degrees of freedom
## Residual deviance: 2448.5  on 1835  degrees of freedom
## AIC: 2458.5
## 
## Number of Fisher Scoring iterations: 4

pc_ob_df = pc_ob_df %>%
  mutate(
    p_hat = predict(obesity_glm, type = "response"),
    pred_class = factor(
      ifelse(p_hat >= 0.5, "Obese", "Not obese"),
      levels = c("Not obese", "Obese")
    )
  )

table(observed = pc_ob_df$obese, predicted = pc_ob_df$pred_class)

##            predicted
## observed    Not obese Obese
##   Not obese      1096    14
##   Obese           714    16

mean(pc_ob_df$pred_class == pc_ob_df$obese)

## [1] 0.6043478

# Variables most aligned with PC1 (absolute loadings)
load1 = sort(abs(pca_obesity$rotation[, "PC1"]), decreasing = TRUE)
head(load1, 10)

##        race_eth3        race_eth4        race_eth7        race_eth2 
##       0.70661107       0.41448249       0.28923049       0.27712040 
##              age smokingSome days        race_eth6     genderFemale 
##       0.21663479       0.19136124       0.18153127       0.17110476 
## smokingEvery day    days_physical 
##       0.14012592       0.05513249

p_obesity_scree

p_obesity_pca

# Does alcohol consumption remain associated with BMI after adjusting smoking status and physical activity?
#   - chi squared (todo: research this topic) Hypothesis testing.
#Is alcohol use category (ALQ_L) associated with high blood pressure (BPQ_L — "ever told you had high blood pressure")?
library(haven)
library(tidyverse)
library(gmodels)
library(corrplot)

## corrplot 0.95 loaded

cst = data_demo %>%      #creating the dataframe for the chi square goodness of fit test
  inner_join(data_alq, by = "SEQN") %>%
  inner_join(data_bpq, by = "SEQN") %>%
  filter(RIDAGEYR >= 18)

cst$ALQ111[is.na(cst$ALQ111)] = 0 # replacing na values with 0 to fit testing
cst$ALQ130[is.na(cst$ALQ130)] = 0


cst["HighBP"]= c( " ")  #creating column space for high blood pressure
cst["Drinks"]= c( " ")  #column for amount of drinks had per day



cst_clean = cst%>% mutate(cst, Drinks = if_else(ALQ111 == 2, "Non-drinker",
                                         if_else(ALQ111 ==1 & ALQ130 <=2,"Light",
                                                 if_else(ALQ111 == 1 & ALQ130 <= 4, "Moderate",
                                                         if_else(ALQ111 ==1 & ALQ130 > 4, "High",
                                                                 if_else(ALQ111 == 9, NA,
                                                                         if_else(ALQ111 == ".", NA, 
                                                                                 if_else(ALQ111==1 & ALQ130 ==777, NA,
                                                                                         if_else(ALQ111==1 & ALQ130 ==999, NA,
                                                                                                 if_else(ALQ111==1 & ALQ130 == ".",NA,"Non-drinker"))))))))),
                     HighBP = if_else(BPQ080 == 1, "Yes", "No"),

      HighBP  = factor(HighBP,  levels = c("No", "Yes")),
      Drinks   = factor(Drinks,
                       levels = c("Non-drinker", "Light",
                                  "Moderate", "High")))

cst_clean= cst_clean[!is.na(cst_clean$Drinks),]

test_data = cst_clean %>% select(HighBP, Drinks)
contingency_table = table(test_data$HighBP, test_data$Drinks)
print(contingency_table)

##      
##       Non-drinker Light Moderate High
##   No          913  1959      566  377
##   Yes         501  1547      284  185

chisq_res = chisq.test(contingency_table)  # test of independence
chisq_res

## 
##  Pearson's Chi-squared test
## 
## data:  contingency_table
## X-squared = 64.234, df = 3, p-value = 7.315e-14

chisq_res$expected

##      
##       Non-drinker    Light Moderate     High
##   No     851.9283 2112.348  512.121 338.6023
##   Yes    562.0717 1393.652  337.879 223.3977

print(round(chisq_res$expected,2))

##      
##       Non-drinker   Light Moderate  High
##   No       851.93 2112.35   512.12 338.6
##   Yes      562.07 1393.65   337.88 223.4

print(round(chisq_res$residuals, 2))

##      
##       Non-drinker Light Moderate  High
##   No         2.09 -3.34     2.38  2.09
##   Yes       -2.58  4.11    -2.93 -2.57

corrplot(chisq_res$residuals, is.cor = FALSE)

#Q4 - Can we predict whether an individual or household is food secure or insecure based on the following variables

library(haven)       
library(tidyverse)
library(caret)

## Loading required package: lattice

## 
## Attaching package: 'caret'

## The following object is masked from 'package:purrr':
## 
##     lift

library(rpart)       
library(rpart.plot)

## Warning: package 'rpart.plot' was built under R version 4.5.2

data_fsq$FSDAD[is.na(data_fsq$FSDAD)] = 0

data_fsq["Food_insecure"]=c(" ") #column that says whether or not the individual is classified as food insecure or not
data_fsq["On_Snap"]=c(" ")  #column that says whether or not they were receiving SNAP benefits in the last 12 months
data_fsq["EFood"]= c(" ")  #column that says whether or not they received emergency food
data_fsq["WIC"]=c(" ")   #column that says whether or not the household received WIC program benefits
data_fsq["NSnap"]= c(" ") #column that says the amount of people in a household received SNAP benefits

# adding N/A as a value to clean later on
fsq_test = data_fsq %>% mutate(data_fsq, Food_insecure= if_else(FSDAD>=3,1, #1 = yes, 2= no
                                                      if_else(FSDAD>0 & FSDAD <3, 2,
                                                              if_else(FSDAD == ".", NA, 2))),
                          On_Snap = if_else(FSQ012>2, NA,
                                                  if_else(FSQ012 == ".",NA, FSQ012)),
                  EFood = if_else(FSD151>2, NA,
                                        if_else(FSD151 == ".",NA,FSD151)),
                  WIC = if_else(FSD162>2, NA,
                                if_else(FSD162== ".", NA, FSD162)),
                  NSnap =if_else(FSD165N>7, NA,
                                 if_else(FSD165N==".",NA,FSD165N)))


demo_test = data_demo %>%
  select(SEQN, age=  RIDAGEYR, sex = RIAGENDR, race = RIDRETH3, education = DMDEDUC2, IPR = INDFMPIR, hh_size = DMDHHSIZ)

test_data2 = fsq_test %>%
    inner_join(demo_test, by = "SEQN") %>%
        select(Food_insecure,On_Snap,EFood,WIC,NSnap,age, sex, race, education, IPR, hh_size)

test_data2 = test_data2 %>% mutate(
                    Food_insecure = factor(Food_insecure, levels = c(1,2), labels = c("Insecure","Secure")),
                    sex = factor(sex, levels = c(1,2), labels = c("Male","Female")),
                    race = factor(race),
                    education = factor(education),
                    On_Snap = factor(On_Snap, levels = c(1,2), labels = c("Yes","No")),
                    EFood= factor(EFood,levels = c(1,2), labels=c("Yes","No")),
                    WIC = factor(WIC,levels = c(1,2), labels = c("Yes","No"))) 
test_data2 = test_data2 %>% na.omit(test_data2)

set.seed(1)



trainsize = floor(0.70 * nrow(test_data2))
indices = sample(seq_len(nrow(test_data2)), size = trainsize)
tdata = test_data2[-indices,]
trdata = test_data2[indices,]

prop.table(table(trdata$Food_insecure))

## 
##  Insecure    Secure 
## 0.3564972 0.6435028

FoodTree = rpart(Food_insecure~ On_Snap + EFood+ WIC + IPR + hh_size + age + sex + race + education,
                 trdata, method = "class", na.action = na.rpart, control = rpart.control(
                   cp = 0.006,
                   minsplit = 20,
                   maxdepth = 6
                 ))

printcp(FoodTree)

## 
## Classification tree:
## rpart(formula = Food_insecure ~ On_Snap + EFood + WIC + IPR + 
##     hh_size + age + sex + race + education, data = trdata, na.action = na.rpart, 
##     method = "class", control = rpart.control(cp = 0.006, minsplit = 20, 
##         maxdepth = 6))
## 
## Variables actually used in tree construction:
## [1] age       education EFood     IPR       race     
## 
## Root node error: 631/1770 = 0.3565
## 
## n= 1770 
## 
##          CP nsplit rel error  xerror     xstd
## 1 0.2250396      0   1.00000 1.00000 0.031935
## 2 0.0332805      1   0.77496 0.77496 0.029814
## 3 0.0079239      2   0.74168 0.74643 0.029464
## 4 0.0060000      7   0.70048 0.75277 0.029544

bestv = FoodTree$cptable[which.min(FoodTree$cptable[, "xerror"]),"CP"]
prunetree = prune(FoodTree,cp=bestv)
rpart.plot(prunetree,
           type = 4,
           extra = 104,
           fallen.leaves = TRUE)

tree_class = predict(prunetree, tdata, type="class")
treeprob = predict(prunetree, tdata, type = "prob")[, "Insecure"]

cm = confusionMatrix(tree_class, tdata$Food_insecure, positive = "Insecure")
print(cm)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Insecure Secure
##   Insecure      130     74
##   Secure        147    408
##                                           
##                Accuracy : 0.7088          
##                  95% CI : (0.6751, 0.7409)
##     No Information Rate : 0.635           
##     P-Value [Acc > NIR] : 1.063e-05       
##                                           
##                   Kappa : 0.3345          
##                                           
##  Mcnemar's Test P-Value : 1.277e-06       
##                                           
##             Sensitivity : 0.4693          
##             Specificity : 0.8465          
##          Pos Pred Value : 0.6373          
##          Neg Pred Value : 0.7351          
##              Prevalence : 0.3650          
##          Detection Rate : 0.1713          
##    Detection Prevalence : 0.2688          
##       Balanced Accuracy : 0.6579          
##                                           
##        'Positive' Class : Insecure        
##

FN = cm$table[2,1]  
TP = cm$table[1,1]    
FNR = FN / (FN + TP)

Final project code

2026-05-11