if (!require(pacman)) install.packages("pacman")Loading required package: pacman
# Load necessary packages
pacman::p_load(
tidyverse,
rvest,
httr,
jsonlite,
furrr,
xml2
)# 1. State-level SNAP program and EBT names
snap_state_programs <- tribble(
~state, ~program_name, ~ebt_brand,
"Alabama", "SNAP", "Alabama EBT",
"Alaska", "SNAP", "Alaska Quest Card",
"Arizona", "Nutrition Assistance", "Arizona Quest",
"Arkansas", "SNAP", "Arkansas EBT",
"California", "CalFresh", "Golden State Advantage",
"Colorado", "SNAP", "Colorado Quest",
"Connecticut", "SNAP", "Connecticut EBT",
"Delaware", "SNAP", "Delaware Food First",
"Florida", "Food Assistance Program", "Florida EBT",
"Georgia", "SNAP", "Georgia EBT",
"Hawaii", "SNAP", "Hawaii EBT",
"Idaho", "Food Stamps", "Idaho Quest",
"Illinois", "SNAP", "Illinois Link Card",
"Indiana", "SNAP", "Hoosier Works",
"Iowa", "Food Assistance", "Iowa EBT",
"Kansas", "Food Assistance", "Kansas EBT",
"Kentucky", "SNAP", "Kentucky EBT",
"Louisiana", "SNAP", "Louisiana Purchase Card",
"Maine", "SNAP", "Maine EBT",
"Maryland", "SNAP", "Maryland Independence Card",
"Massachusetts", "SNAP", "Bay State Access Card",
"Michigan", "Food Assistance Program", "Michigan Bridge Card",
"Minnesota", "SNAP", "Minnesota EBT",
"Mississippi", "SNAP", "Mississippi EBT",
"Missouri", "Food Stamp Program", "Missouri EBT",
"Montana", "SNAP", "Montana Access",
"Nebraska", "SNAP", "Nebraska EBT",
"Nevada", "SNAP", "Nevada Quest",
"New Hampshire", "SNAP", "New Hampshire EBT",
"New Jersey", "NJ SNAP", "Families First Card",
"New Mexico", "SNAP", "New Mexico EBT",
"New York", "SNAP", "New York EBT",
"North Carolina", "Food and Nutrition Services", "NC EBT Card",
"North Dakota", "SNAP", "North Dakota EBT",
"Ohio", "SNAP", "Ohio Direction Card",
"Oklahoma", "SNAP", "Oklahoma Access Card",
"Oregon", "SNAP", "Oregon Trail Card",
"Pennsylvania", "SNAP", "ACCESS Card",
"Rhode Island", "SNAP", "Rhode Island EBT",
"South Carolina", "SNAP", "South Carolina EBT",
"South Dakota", "SNAP", "South Dakota EBT",
"Tennessee", "SNAP", "Tennessee Benefit Security Card",
"Texas", "SNAP", "Lone Star Card",
"Utah", "SNAP", "Horizon Card",
"Vermont", "3SquaresVT", "Vermont EBT",
"Virginia", "SNAP", "Virginia EBT",
"Washington", "Basic Food", "Washington Quest",
"West Virginia", "SNAP", "Mountain State Card",
"Wisconsin", "FoodShare Wisconsin", "Wisconsin Quest",
"Wyoming", "SNAP", "Wyoming EBT"
)
# 2. Build a keyword vector from this table
state_based_terms <- snap_state_programs %>%
select(program_name, ebt_brand) %>%
tidyr::pivot_longer(cols = everything(), values_to = "term") %>%
filter(!is.na(term), term != "") %>%
mutate(term = tolower(term)) %>%
distinct(term) %>%
pull(term)
# 3. Add generic SNAP / benefits terms
generic_snap_terms <- c(
"snap",
"supplemental nutrition assistance program",
"snap benefits",
"snap application",
"snap applications",
"food stamps",
"food stamp program",
"food assistance",
"food assistance program",
"food supplement program",
"nutrition assistance",
"nutrition assistance program",
"public benefits",
"benefit enrollment",
"apply for benefits",
"help with benefits",
"ebt",
"ebt card",
"electronic benefit transfer"
)
# Variants / context (you can expand this further)
variant_and_context_terms <- c(
"calfresh",
"cal fresh",
"cal-fresh",
"food insecurity",
"food insecure",
"basic needs",
"basic-needs",
"basic needs center",
"basic needs program"
)
snap_keywords <- unique(c(
generic_snap_terms,
state_based_terms,
variant_and_context_terms))test_urls <- c(
"https://211sandiego.org/food-assistance-resources/calfresh-food-assistance/",
"https://www.shfb.org/get-food/calfresh/",
"https://www.csulb.edu/student-affairs/basic-needs/food",
"https://basicneeds.berkeley.edu/calfresh",
"https://studentaffairs.fresnostate.edu/foodsecurity/calfresh.html",
"https://basicneeds.ucsd.edu/food-security/calfresh/index.html",
"https://211sonoma.org/cal-fresh-food-stamps/",
"https://www.calfreshcalpoly.org/",
"https://chicostatecalfresh.org/",
"https://basicneeds.ucla.edu/services/calfresh",
"https://www.foodbankccs.org/find-food/calfresh/",
"https://www.cdss.ca.gov/calfresh",
"https://basicneeds.uci.edu/calfresh-application-assistance/",
"https://basicneeds.sfsu.edu/calfresh-help-clinic",
"https://www.csus.edu/student-affairs/crisis-assistance-resource-education-support/calfresh-application-assistance.html",
"https://www.accfb.org/about-us/calfresh/",
"https://basicneeds.ucsb.edu/resources/food/calfresh"
)scrape_snap_org <- function(
url,
snap_keywords_vec = snap_keywords,
timeout_sec = 15
) {
# Normalize URL (add scheme if missing)
if (!grepl("^https?://", url)) {
url <- paste0("https://", url)
}
Sys.sleep(runif(1, 0.3, 0.8))
# Request page
resp <- tryCatch(
{
httr::GET(
url,
httr::user_agent("SNAP-org-scraper (research; contact: you@example.org)"),
httr::timeout(timeout_sec)
)
},
error = function(e) {
warning(paste("Request error for URL:", url, "-", e$message))
return(NULL)
}
)
if (is.null(resp)) {
return(tibble(
url = url,
org_name = NA_character_,
snap_support_found = FALSE,
snap_support_url = FALSE,
snap_support_links = FALSE,
snap_support_body = FALSE,
current_path = NA_character_,
matched_keywords = NA_character_,
matched_link_hrefs = NA_character_,
matched_link_texts = NA_character_,
status_code = NA_integer_
))
}
status <- httr::status_code(resp)
if (status >= 400) {
warning(paste("HTTP error", status, "for URL:", url))
return(tibble(
url = url,
org_name = NA_character_,
snap_support_found = FALSE,
snap_support_url = FALSE,
snap_support_links = FALSE,
snap_support_body = FALSE,
current_path = NA_character_,
matched_keywords = NA_character_,
matched_link_hrefs = NA_character_,
matched_link_texts = NA_character_,
status_code = status
))
}
# Parse HTML
page <- tryCatch(
{
content_text <- httr::content(resp, as = "text", encoding = "UTF-8")
read_html(content_text)
},
error = function(e) {
warning(paste("HTML parse error for URL:", url, "-", e$message))
return(NULL)
}
)
if (is.null(page)) {
return(tibble(
url = url,
org_name = NA_character_,
snap_support_found = FALSE,
snap_support_url = FALSE,
snap_support_links = FALSE,
snap_support_body = FALSE,
current_path = NA_character_,
matched_keywords = NA_character_,
matched_link_hrefs = NA_character_,
matched_link_texts = NA_character_,
status_code = status
))
}
## Org name (base R only)
org_name <- page %>%
html_element("head > title") %>%
safe_html_text()
if (length(org_name) == 0 || is.na(org_name)) {
org_name <- ""
}
# Strip common separators from title
org_name <- gsub(" - .*$", "", org_name, perl = TRUE)
org_name <- gsub("\\|.*$", "", org_name, perl = TRUE)
org_name <- gsub("::.*$", "", org_name, perl = TRUE)
org_name <- gsub("\\s+", " ", org_name)
org_name <- trimws(org_name)
if (identical(org_name, "")) {
org_name_fallback <- page %>%
html_element("h1") %>%
safe_html_text()
if (!is.null(org_name_fallback)) {
org_name <- trimws(gsub("\\s+", " ", org_name_fallback))
} else {
org_name <- NA_character_
}
}
## 1) Check current URL path against each keyword
parsed_url <- tryCatch(httr::parse_url(url), error = function(e) NULL)
current_path <- if (!is.null(parsed_url) && !is.null(parsed_url$path)) {
parsed_url$path
} else {
""
}
current_path_lower <- tolower(current_path)
snap_support_url <- any(vapply(
snap_keywords,
function(k) grepl(k, current_path_lower, ignore.case = TRUE, fixed = FALSE),
logical(1)
))
## 2) Inspect links
links <- page %>%
html_elements("a[href]")
links_df <- tibble(
href_raw = html_attr(links, "href"),
link_text = safe_html_text(links)
) %>%
filter(!is.na(href_raw), href_raw != "")
if (nrow(links_df) > 0) {
links_df <- links_df %>%
mutate(
href_abs = url_absolute(href_raw, url),
path = map_chr(
href_abs,
~ {
pu <- tryCatch(httr::parse_url(.x), error = function(e) NULL)
if (is.null(pu) || is.null(pu$path)) "" else pu$path
}
),
link_text_lower = tolower(link_text),
path_lower = tolower(path)
)
is_snap_link <- vapply(
seq_len(nrow(links_df)),
function(i) {
any(vapply(
snap_keywords_vec,
function(k) {
grepl(k, links_df$link_text_lower[i], ignore.case = TRUE, fixed = FALSE) ||
grepl(k, links_df$path_lower[i], ignore.case = TRUE, fixed = FALSE)
},
logical(1)
))
},
logical(1)
)
links_df$is_snap_link <- is_snap_link
snap_links_df <- links_df %>% filter(is_snap_link)
snap_support_links <- nrow(snap_links_df) > 0
matched_link_hrefs <- if (snap_support_links) {
snap_links_df$href_abs %>% unique() %>% paste(collapse = " | ")
} else {
NA_character_
}
matched_link_texts <- if (snap_support_links) {
snap_links_df$link_text %>%
gsub("\\s+", " ", .) %>%
trimws() %>%
unique() %>%
paste(collapse = " | ")
} else {
NA_character_
}
} else {
snap_support_links <- FALSE
matched_link_hrefs <- NA_character_
matched_link_texts <- NA_character_
}
## 3) Body text
page_text <- page %>%
html_element("body") %>%
safe_html_text()
if (is.na(page_text) || length(page_text) == 0) {
page_text <- ""
}
page_text_lower <- tolower(page_text)
snap_support_body <- any(vapply(
snap_keywords_vec,
function(k) grepl(k, page_text_lower, ignore.case = TRUE, fixed = FALSE),
logical(1)
))
matched <- snap_keywords[
vapply(
snap_keywords_vec,
function(k) grepl(k, page_text_lower, ignore.case = TRUE, fixed = FALSE),
logical(1)
)
]
matched_keywords <- if (length(matched) == 0) NA_character_ else paste(unique(matched), collapse = "; ")
## Final flag
snap_support_found <- snap_support_url || snap_support_links || snap_support_body
tibble(
url = url,
org_name = org_name,
snap_support_found = snap_support_found,
snap_support_url = snap_support_url,
snap_support_links = snap_support_links,
snap_support_body = snap_support_body,
current_path = current_path,
matched_keywords = matched_keywords,
matched_link_hrefs = matched_link_hrefs,
matched_link_texts = matched_link_texts,
status_code = status
)
}scan_site_for_snap <- function(
root_url,
snap_keywords_vec = snap_keywords,
max_pages = 40
) {
if (!grepl("^https?://", root_url)) {
root_url <- paste0("https://", root_url)
}
root_parsed <- tryCatch(httr::parse_url(root_url), error = function(e) NULL)
root_host <- if (!is.null(root_parsed)) root_parsed$hostname else NA_character_
root_domain <- get_reg_domain(root_host)
visited <- character(0)
to_visit <- root_url
results <- list()
while (length(to_visit) > 0 && length(visited) < max_pages) {
current <- to_visit[1]
to_visit <- to_visit[-1]
if (current %in% visited) next
visited <- c(visited, current)
# 1) Page-level check
res <- scrape_snap_org(
url = current,
snap_keywords_vec = snap_keywords_vec
)
results[[length(results) + 1]] <- res
# Optional: if you only care that "org has at least one SNAP page", you can stop early
# if (isTRUE(res$snap_support_found[1])) break
# 2) Collect more links from this page if no SNAP found yet on it
page <- tryCatch(
{
resp <- httr::GET(current, httr::timeout(10))
if (httr::status_code(resp) >= 400) NULL
else read_html(httr::content(resp, as = "text", encoding = "UTF-8"))
},
error = function(e) NULL
)
if (is.null(page)) next
links <- page %>%
html_elements("a[href]")
hrefs <- html_attr(links, "href")
hrefs <- hrefs[!is.na(hrefs) & hrefs != ""]
hrefs_abs <- url_absolute(hrefs, current)
same_org <- vapply(
hrefs_abs,
function(u) {
pu <- tryCatch(httr::parse_url(u), error = function(e) NULL)
if (is.null(pu) || is.null(pu$hostname)) {
FALSE
} else {
get_reg_domain(pu$hostname) == root_domain
}
},
logical(1)
)
new_links <- hrefs_abs[same_org]
new_links <- setdiff(new_links, c(visited, to_visit))
to_visit <- c(to_visit, new_links)
}
dplyr::bind_rows(results)
}Warning in scrape_snap_org(url = current, snap_keywords_vec =
snap_keywords_vec): HTTP error 500 for URL:
https://www.unc.edu/life/about-the-city/
# A tibble: 40 × 11
url org_name snap_support_found snap_support_url snap_support_links
<chr> <chr> <lgl> <lgl> <lgl>
1 https://www.… The Uni… FALSE FALSE FALSE
2 https://aler… Alert C… FALSE FALSE FALSE
3 https://www.… The Uni… FALSE FALSE FALSE
4 https://www.… The Well FALSE FALSE FALSE
5 https://www.… About t… TRUE FALSE FALSE
6 https://www.… Alumni FALSE FALSE FALSE
7 https://www.… Carolin… TRUE FALSE FALSE
8 https://iden… Univers… FALSE FALSE FALSE
9 http://uncne… Home FALSE FALSE FALSE
10 https://www.… Frequen… FALSE FALSE FALSE
# ℹ 30 more rows
# ℹ 6 more variables: snap_support_body <lgl>, current_path <chr>,
# matched_keywords <chr>, matched_link_hrefs <chr>, matched_link_texts <chr>,
# status_code <int>