snap_org_scraping

Setup

if (!require(pacman)) install.packages("pacman")
Loading required package: pacman
# Load necessary packages
pacman::p_load(
  tidyverse,
  rvest,
  httr,
  jsonlite,
  furrr,
  xml2
)

SNAP keywords

# 1. State-level SNAP program and EBT names

snap_state_programs <- tribble(
  ~state,           ~program_name,                    ~ebt_brand,
  "Alabama",        "SNAP",                           "Alabama EBT",
  "Alaska",         "SNAP",                           "Alaska Quest Card",
  "Arizona",        "Nutrition Assistance",           "Arizona Quest",
  "Arkansas",       "SNAP",                           "Arkansas EBT",
  "California",     "CalFresh",                       "Golden State Advantage",
  "Colorado",       "SNAP",                           "Colorado Quest",
  "Connecticut",    "SNAP",                           "Connecticut EBT",
  "Delaware",       "SNAP",                           "Delaware Food First",
  "Florida",        "Food Assistance Program",        "Florida EBT",
  "Georgia",        "SNAP",                           "Georgia EBT",
  "Hawaii",         "SNAP",                           "Hawaii EBT",
  "Idaho",          "Food Stamps",                    "Idaho Quest",
  "Illinois",       "SNAP",                           "Illinois Link Card",
  "Indiana",        "SNAP",                           "Hoosier Works",
  "Iowa",           "Food Assistance",                "Iowa EBT",
  "Kansas",         "Food Assistance",                "Kansas EBT",
  "Kentucky",       "SNAP",                           "Kentucky EBT",
  "Louisiana",      "SNAP",                           "Louisiana Purchase Card",
  "Maine",          "SNAP",                           "Maine EBT",
  "Maryland",       "SNAP",                           "Maryland Independence Card",
  "Massachusetts",  "SNAP",                           "Bay State Access Card",
  "Michigan",       "Food Assistance Program",        "Michigan Bridge Card",
  "Minnesota",      "SNAP",                           "Minnesota EBT",
  "Mississippi",    "SNAP",                           "Mississippi EBT",
  "Missouri",       "Food Stamp Program",             "Missouri EBT",
  "Montana",        "SNAP",                           "Montana Access",
  "Nebraska",       "SNAP",                           "Nebraska EBT",
  "Nevada",         "SNAP",                           "Nevada Quest",
  "New Hampshire",  "SNAP",                           "New Hampshire EBT",
  "New Jersey",     "NJ SNAP",                        "Families First Card",
  "New Mexico",     "SNAP",                           "New Mexico EBT",
  "New York",       "SNAP",                           "New York EBT",
  "North Carolina", "Food and Nutrition Services",    "NC EBT Card",
  "North Dakota",   "SNAP",                           "North Dakota EBT",
  "Ohio",           "SNAP",                           "Ohio Direction Card",
  "Oklahoma",       "SNAP",                           "Oklahoma Access Card",
  "Oregon",         "SNAP",                           "Oregon Trail Card",
  "Pennsylvania",   "SNAP",                           "ACCESS Card",
  "Rhode Island",   "SNAP",                           "Rhode Island EBT",
  "South Carolina", "SNAP",                           "South Carolina EBT",
  "South Dakota",   "SNAP",                           "South Dakota EBT",
  "Tennessee",      "SNAP",                           "Tennessee Benefit Security Card",
  "Texas",          "SNAP",                           "Lone Star Card",
  "Utah",           "SNAP",                           "Horizon Card",
  "Vermont",        "3SquaresVT",                     "Vermont EBT",
  "Virginia",       "SNAP",                           "Virginia EBT",
  "Washington",     "Basic Food",                     "Washington Quest",
  "West Virginia",  "SNAP",                           "Mountain State Card",
  "Wisconsin",      "FoodShare Wisconsin",            "Wisconsin Quest",
  "Wyoming",        "SNAP",                           "Wyoming EBT"
)
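
A quick sanity check on the lookup table (a sketch; stopifnot() aborts if the table ever drifts from one row per state):

stopifnot(
  nrow(snap_state_programs) == 50,
  dplyr::n_distinct(snap_state_programs$state) == 50,
  !anyNA(snap_state_programs$program_name),
  !anyNA(snap_state_programs$ebt_brand)
)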

# 2. Build a keyword vector from this table

state_based_terms <- snap_state_programs %>%
  select(program_name, ebt_brand) %>%
  tidyr::pivot_longer(cols = everything(), values_to = "term") %>%
  filter(!is.na(term), term != "") %>%
  mutate(term = tolower(term)) %>%
  distinct(term) %>%
  pull(term)


# 3. Add generic SNAP / benefits terms

generic_snap_terms <- c(
  "snap",
  "supplemental nutrition assistance program",
  "snap benefits",
  "snap application",
  "snap applications",
  "food stamps",
  "food stamp program",
  "food assistance",
  "food assistance program",
  "food supplement program",
  "nutrition assistance",
  "nutrition assistance program",
  "public benefits",
  "benefit enrollment",
  "apply for benefits",
  "help with benefits",
  "ebt",
  "ebt card",
  "electronic benefit transfer"
)

# Variants / context (you can expand this further)
variant_and_context_terms <- c(
  "calfresh",
  "cal fresh",
  "cal-fresh",
  "food insecurity",
  "food insecure",
  "basic needs",
  "basic-needs",
  "basic needs center",
  "basic needs program"
)

snap_keywords <- unique(c(
  generic_snap_terms,
  state_based_terms,
  variant_and_context_terms))
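
The matching further down treats keywords as literal, lowercased strings, so it is worth asserting that invariant once (a minimal check):

# Every keyword must already be lowercase, since page text is lowered before matching
stopifnot(identical(snap_keywords, tolower(snap_keywords)))
length(snap_keywords)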

Test URLs

test_urls <- c(
  "https://211sandiego.org/food-assistance-resources/calfresh-food-assistance/",
  "https://www.shfb.org/get-food/calfresh/",
  "https://www.csulb.edu/student-affairs/basic-needs/food",
  "https://basicneeds.berkeley.edu/calfresh",
  "https://studentaffairs.fresnostate.edu/foodsecurity/calfresh.html",
  "https://basicneeds.ucsd.edu/food-security/calfresh/index.html",
  "https://211sonoma.org/cal-fresh-food-stamps/",
  "https://www.calfreshcalpoly.org/",
  "https://chicostatecalfresh.org/",
  "https://basicneeds.ucla.edu/services/calfresh",
  "https://www.foodbankccs.org/find-food/calfresh/",
  "https://www.cdss.ca.gov/calfresh",
  "https://basicneeds.uci.edu/calfresh-application-assistance/",
  "https://basicneeds.sfsu.edu/calfresh-help-clinic",
  "https://www.csus.edu/student-affairs/crisis-assistance-resource-education-support/calfresh-application-assistance.html",
  "https://www.accfb.org/about-us/calfresh/",
  "https://basicneeds.ucsb.edu/resources/food/calfresh"
)

Scraping function

Helpers

# Extract text from an xml node or nodeset, collapsing runs of whitespace.
# Returns "" for an empty nodeset; NA for a missing node.
safe_html_text <- function(x) {
  txt <- xml2::xml_text(x)
  if (length(txt) == 0) {
    return("")
  }
  txt <- gsub("\\s+", " ", txt)
  trimws(txt)
}

# Registrable domain, very roughly: keep the last two labels of the host,
# e.g. unc.edu, berkeley.edu, ucdavis.edu.
get_reg_domain <- function(host) {
  if (is.na(host) || host == "") return(NA_character_)
  sub("^.*?([^.]+\\.[^.]+)$", "\\1", host)
}
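
A few illustrative calls (the heuristic keeps only the last two host labels, so multi-part public suffixes would be misread):

get_reg_domain("basicneeds.berkeley.edu")  # "berkeley.edu"
get_reg_domain("www.unc.edu")              # "unc.edu"
get_reg_domain("www.example.co.uk")        # "co.uk" -- heuristic limitation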

Page-level

scrape_snap_org <- function(
  url,
  snap_keywords_vec = snap_keywords,
  timeout_sec = 15
) {
  # Normalize URL (add scheme if missing)
  if (!grepl("^https?://", url)) {
    url <- paste0("https://", url)
  }
  
  # Polite random delay between requests
  Sys.sleep(runif(1, 0.3, 0.8))
  
  # Request page
  resp <- tryCatch(
    {
      httr::GET(
        url,
        httr::user_agent("SNAP-org-scraper (research; contact: you@example.org)"),
        httr::timeout(timeout_sec)
      )
    },
    error = function(e) {
      warning(paste("Request error for URL:", url, "-", e$message))
      return(NULL)
    }
  )
  
  if (is.null(resp)) {
    return(tibble(
      url = url,
      org_name = NA_character_,
      snap_support_found = FALSE,
      snap_support_url = FALSE,
      snap_support_links = FALSE,
      snap_support_body = FALSE,
      current_path = NA_character_,
      matched_keywords = NA_character_,
      matched_link_hrefs = NA_character_,
      matched_link_texts = NA_character_,
      status_code = NA_integer_
    ))
  }
  
  status <- httr::status_code(resp)
  if (status >= 400) {
    warning(paste("HTTP error", status, "for URL:", url))
    return(tibble(
      url = url,
      org_name = NA_character_,
      snap_support_found = FALSE,
      snap_support_url = FALSE,
      snap_support_links = FALSE,
      snap_support_body = FALSE,
      current_path = NA_character_,
      matched_keywords = NA_character_,
      matched_link_hrefs = NA_character_,
      matched_link_texts = NA_character_,
      status_code = status
    ))
  }
  
  # Parse HTML
  page <- tryCatch(
    {
      content_text <- httr::content(resp, as = "text", encoding = "UTF-8")
      read_html(content_text)
    },
    error = function(e) {
      warning(paste("HTML parse error for URL:", url, "-", e$message))
      return(NULL)
    }
  )
  
  if (is.null(page)) {
    return(tibble(
      url = url,
      org_name = NA_character_,
      snap_support_found = FALSE,
      snap_support_url = FALSE,
      snap_support_links = FALSE,
      snap_support_body = FALSE,
      current_path = NA_character_,
      matched_keywords = NA_character_,
      matched_link_hrefs = NA_character_,
      matched_link_texts = NA_character_,
      status_code = status
    ))
  }
  
  ## Org name (base R only)
  org_name <- page %>%
    html_element("head > title") %>%
    safe_html_text()
  
  if (is.na(org_name)) {
    org_name <- ""
  }
  
  # Strip common separators from title
  org_name <- gsub(" - .*$", "", org_name, perl = TRUE)
  org_name <- gsub("\\|.*$", "", org_name, perl = TRUE)
  org_name <- gsub("::.*$", "", org_name, perl = TRUE)
  org_name <- gsub("\\s+", " ", org_name)
  org_name <- trimws(org_name)
  
  if (identical(org_name, "")) {
    org_name_fallback <- page %>%
      html_element("h1") %>%
      safe_html_text()
    # safe_html_text() returns NA or "" (never NULL) when <h1> is missing
    if (!is.na(org_name_fallback) && org_name_fallback != "") {
      org_name <- trimws(gsub("\\s+", " ", org_name_fallback))
    } else {
      org_name <- NA_character_
    }
  }
  
  ## 1) Check current URL path against each keyword
  parsed_url <- tryCatch(httr::parse_url(url), error = function(e) NULL)
  current_path <- if (!is.null(parsed_url) && !is.null(parsed_url$path)) {
    parsed_url$path
  } else {
    ""
  }
  current_path_lower <- tolower(current_path)
  
  snap_support_url <- any(vapply(
    snap_keywords_vec,
    # fixed = TRUE: keywords are literal strings and both sides are lowercased
    function(k) grepl(k, current_path_lower, fixed = TRUE),
    logical(1)
  ))
  
  ## 2) Inspect links
  links <- page %>%
    html_elements("a[href]")
  
  links_df <- tibble(
    href_raw  = html_attr(links, "href"),
    link_text = safe_html_text(links)
  ) %>%
    filter(!is.na(href_raw), href_raw != "")
  
  if (nrow(links_df) > 0) {
    links_df <- links_df %>%
      mutate(
        href_abs = url_absolute(href_raw, url),
        path     = map_chr(
          href_abs,
          ~ {
            pu <- tryCatch(httr::parse_url(.x), error = function(e) NULL)
            if (is.null(pu) || is.null(pu$path)) "" else pu$path
          }
        ),
        link_text_lower = tolower(link_text),
        path_lower      = tolower(path)
      )
    
    is_snap_link <- vapply(
      seq_len(nrow(links_df)),
      function(i) {
        any(vapply(
          snap_keywords_vec,
          function(k) {
            # fixed = TRUE: literal matching on already-lowercased text
            grepl(k, links_df$link_text_lower[i], fixed = TRUE) ||
              grepl(k, links_df$path_lower[i], fixed = TRUE)
          },
          logical(1)
        ))
      },
      logical(1)
    )
    
    links_df$is_snap_link <- is_snap_link
    
    snap_links_df <- links_df %>% filter(is_snap_link)
    snap_support_links <- nrow(snap_links_df) > 0
    
    matched_link_hrefs <- if (snap_support_links) {
      snap_links_df$href_abs %>% unique() %>% paste(collapse = " | ")
    } else {
      NA_character_
    }
    
    matched_link_texts <- if (snap_support_links) {
      snap_links_df$link_text %>%
        gsub("\\s+", " ", .) %>%
        trimws() %>%
        unique() %>%
        paste(collapse = " | ")
    } else {
      NA_character_
    }
  } else {
    snap_support_links <- FALSE
    matched_link_hrefs <- NA_character_
    matched_link_texts <- NA_character_
  }
  
  ## 3) Body text
  page_text <- page %>%
    html_element("body") %>%
    safe_html_text()
  
  if (is.na(page_text) || length(page_text) == 0) {
    page_text <- ""
  }
  
  page_text_lower <- tolower(page_text)
  
  body_matches <- vapply(
    snap_keywords_vec,
    function(k) grepl(k, page_text_lower, fixed = TRUE),
    logical(1)
  )
  snap_support_body <- any(body_matches)
  
  matched <- snap_keywords_vec[body_matches]
  
  matched_keywords <- if (length(matched) == 0) {
    NA_character_
  } else {
    paste(unique(matched), collapse = "; ")
  }
  
  ## Final flag
  snap_support_found <- snap_support_url || snap_support_links || snap_support_body
  
  tibble(
    url = url,
    org_name = org_name,
    snap_support_found = snap_support_found,
    snap_support_url = snap_support_url,
    snap_support_links = snap_support_links,
    snap_support_body = snap_support_body,
    current_path = current_path,
    matched_keywords = matched_keywords,
    matched_link_hrefs = matched_link_hrefs,
    matched_link_texts = matched_link_texts,
    status_code = status
  )
}
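
A minimal smoke test of the page-level function against one of the test URLs (requires network access; columns trimmed for display):

scrape_snap_org(test_urls[1]) %>%
  select(url, org_name, snap_support_found, status_code)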

Org-level

scan_site_for_snap <- function(
  root_url,
  snap_keywords_vec = snap_keywords,
  max_pages = 40
) {
  if (!grepl("^https?://", root_url)) {
    root_url <- paste0("https://", root_url)
  }
  
  root_parsed <- tryCatch(httr::parse_url(root_url), error = function(e) NULL)
  root_host <- if (!is.null(root_parsed)) root_parsed$hostname else NA_character_
  root_domain <- get_reg_domain(root_host)
  
  visited <- character(0)
  to_visit <- root_url
  results <- list()
  
  while (length(to_visit) > 0 && length(visited) < max_pages) {
    current <- to_visit[1]
    to_visit <- to_visit[-1]
    
    if (current %in% visited) next
    visited <- c(visited, current)
    
    # 1) Page-level check
    res <- scrape_snap_org(
      url = current,
      snap_keywords_vec = snap_keywords_vec
    )
    results[[length(results) + 1]] <- res
    
    # Optional: if you only care that "org has at least one SNAP page", you can stop early
    # if (isTRUE(res$snap_support_found[1])) break
    
    # 2) Collect same-domain links from this page to keep crawling.
    #    (Note: this re-fetches the page; scrape_snap_org() made its own request.)
    page <- tryCatch(
      {
        resp <- httr::GET(
          current,
          httr::user_agent("SNAP-org-scraper (research; contact: you@example.org)"),
          httr::timeout(10)
        )
        if (httr::status_code(resp) >= 400) NULL
        else read_html(httr::content(resp, as = "text", encoding = "UTF-8"))
      },
      error = function(e) NULL
    )
    if (is.null(page)) next
    
    links <- page %>%
      html_elements("a[href]")
    
    hrefs <- html_attr(links, "href")
    hrefs <- hrefs[!is.na(hrefs) & hrefs != ""]
    hrefs_abs <- url_absolute(hrefs, current)
    
    same_org <- vapply(
      hrefs_abs,
      function(u) {
        pu <- tryCatch(httr::parse_url(u), error = function(e) NULL)
        if (is.null(pu) || is.null(pu$hostname)) {
          FALSE
        } else {
          d <- get_reg_domain(pu$hostname)
          !is.na(d) && identical(d, root_domain)
        }
      },
      logical(1)
    )
    
    new_links <- hrefs_abs[same_org]
    new_links <- setdiff(new_links, c(visited, to_visit))
    
    to_visit <- c(to_visit, new_links)
  }
  
  dplyr::bind_rows(results)
}

Testing

scan_site_for_snap(
  "https://www.unc.edu/",
  snap_keywords_vec = snap_keywords,
  max_pages = 40
)
Warning in scrape_snap_org(url = current, snap_keywords_vec =
snap_keywords_vec): HTTP error 500 for URL:
https://www.unc.edu/life/about-the-city/
# A tibble: 40 × 11
   url           org_name snap_support_found snap_support_url snap_support_links
   <chr>         <chr>    <lgl>              <lgl>            <lgl>             
 1 https://www.… The Uni… FALSE              FALSE            FALSE             
 2 https://aler… Alert C… FALSE              FALSE            FALSE             
 3 https://www.… The Uni… FALSE              FALSE            FALSE             
 4 https://www.… The Well FALSE              FALSE            FALSE             
 5 https://www.… About t… TRUE               FALSE            FALSE             
 6 https://www.… Alumni   FALSE              FALSE            FALSE             
 7 https://www.… Carolin… TRUE               FALSE            FALSE             
 8 https://iden… Univers… FALSE              FALSE            FALSE             
 9 http://uncne… Home     FALSE              FALSE            FALSE             
10 https://www.… Frequen… FALSE              FALSE            FALSE             
# ℹ 30 more rows
# ℹ 6 more variables: snap_support_body <lgl>, current_path <chr>,
#   matched_keywords <chr>, matched_link_hrefs <chr>, matched_link_texts <chr>,
#   status_code <int>
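
furrr is loaded in Setup but not used above; a sketch of how the page-level check could run in parallel over the test URLs (the worker count here is an arbitrary choice):

# Parallel page-level scan of the CalFresh test URLs
future::plan(future::multisession, workers = 4)

test_results <- furrr::future_map_dfr(
  test_urls,
  scrape_snap_org,
  .options = furrr::furrr_options(seed = TRUE)  # scrape_snap_org() draws from runif()
)

future::plan(future::sequential)

test_results %>% count(snap_support_found)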