Skip to content

A variable called n on the frame causes issues with select_sample and ExpectedHits #7

@szimmer

Description

@szimmer

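Add documentation clarifying which methods output selection probabilities and which output expected hits, and explain how expected hits are calculated. The reprex below shows the related bug: when the frame contains a column named `n`, the `ExpectedHits` returned by `select_sample()` differs from the value returned after that column is dropped.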

library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(SampleSelectR)
# Fix the RNG seed so the run is reproducible.
set.seed(8675309)

# Slim sampling frame that deliberately carries a column literally named `n`
# (50 on every row). `ExpHits_man` is a hand-computed reference value:
# 10 * Pop_Tot / (total Pop_Tot within the unit's Region).
county_2023_slim_n <- mutate(
  select(county_2023, GEOID, Region, Pop_Tot),
  n = 50,
  ExpHits_man = 10 * Pop_Tot / sum(Pop_Tot),
  .by = "Region"
)

# One row per Region with a requested sample size of 10; this table is what
# gets passed as the `n` argument of select_sample() below.
sampsizes <- mutate(
  distinct(county_2023_slim_n, Region),
  sample_size = 10
)

# Case 1: the frame still contains its own `n` column (50 on every row) while
# the `n` argument asks for 10 units per Region. In the comparison further
# down, ExpectedHits from this call is 5x the value from the call without the
# column -- presumably the frame's `n` (50) is being picked up in place of the
# requested sample size (10); confirm in select_sample().
samp1 <- select_sample(
  county_2023_slim_n,
  "sys_pps",
  n = sampsizes,
  strata = "Region",
  mos = "Pop_Tot",
  outall = TRUE
)
#> No sorting variables are provided so frame is assumed to be already sorted for systematic sampling.
#> Stratum: Region = South 
#> --Frame size: 1422
#> --Sample size: 10
#> --Sampling interval (k): 12763101
#> --Random start (r): 2035506
#> Stratum: Region = West 
#> --Frame size: 449
#> --Sample size: 10
#> --Sampling interval (k): 7864612
#> --Random start (r): 3760766
#> Stratum: Region = Northeast 
#> --Frame size: 218
#> --Sample size: 10
#> --Sampling interval (k): 5722245
#> --Random start (r): 4376366
#> Stratum: Region = Midwest 
#> --Frame size: 1055
#> --Sample size: 10
#> --Sampling interval (k): 6888795
#> --Random start (r): 5302221
# Case 2: identical call, except the `n` column is dropped from the frame
# first. The seed is not reset between the two calls, so the random starts
# differ from case 1; only ExpectedHits is compared further down.
samp2 <- select_sample(
  select(county_2023_slim_n, -n),
  "sys_pps",
  n = sampsizes,
  strata = "Region",
  mos = "Pop_Tot",
  outall = TRUE
)
#> No sorting variables are provided so frame is assumed to be already sorted for systematic sampling.
#> Stratum: Region = South 
#> --Frame size: 1422
#> --Sample size: 10
#> --Sampling interval (k): 12763101
#> --Random start (r): 3427512
#> Stratum: Region = West 
#> --Frame size: 449
#> --Sample size: 10
#> --Sampling interval (k): 7864612
#> --Random start (r): 5293245
#> Stratum: Region = Northeast 
#> --Frame size: 218
#> --Sample size: 10
#> --Sampling interval (k): 5722245
#> --Random start (r): 5600881
#> Stratum: Region = Midwest 
#> --Frame size: 1055
#> --Sample size: 10
#> --Sampling interval (k): 6888795
#> --Random start (r): 5830173

# Show the result for the frame that still contains the `n` column.
samp1
#> # A tidytable: 3,144 × 9
#>    Region GEOID Pop_Tot     n ExpHits_man SelectionIndicator SamplingWeight
#>    <fct>  <chr>   <dbl> <dbl>       <dbl> <lgl>                       <dbl>
#>  1 South  01001   59285    50    0.00465  FALSE                          NA
#>  2 South  01003  239945    50    0.0188   FALSE                          NA
#>  3 South  01005   24757    50    0.00194  FALSE                          NA
#>  4 South  01007   22152    50    0.00174  FALSE                          NA
#>  5 South  01009   59292    50    0.00465  FALSE                          NA
#>  6 South  01011   10157    50    0.000796 FALSE                          NA
#>  7 South  01013   18807    50    0.00147  FALSE                          NA
#>  8 South  01015  116141    50    0.00910  FALSE                          NA
#>  9 South  01017   34450    50    0.00270  FALSE                          NA
#> 10 South  01019   25224    50    0.00198  FALSE                          NA
#> # ℹ 3,134 more rows
#> # ℹ 2 more variables: NumberHits <int>, ExpectedHits <dbl>
# Show the result for the frame with the `n` column dropped.
samp2
#> # A tidytable: 3,144 × 8
#>    Region GEOID Pop_Tot ExpHits_man SelectionIndicator SamplingWeight NumberHits
#>    <fct>  <chr>   <dbl>       <dbl> <lgl>                       <dbl>      <int>
#>  1 South  01001   59285    0.00465  FALSE                          NA          0
#>  2 South  01003  239945    0.0188   FALSE                          NA          0
#>  3 South  01005   24757    0.00194  FALSE                          NA          0
#>  4 South  01007   22152    0.00174  FALSE                          NA          0
#>  5 South  01009   59292    0.00465  FALSE                          NA          0
#>  6 South  01011   10157    0.000796 FALSE                          NA          0
#>  7 South  01013   18807    0.00147  FALSE                          NA          0
#>  8 South  01015  116141    0.00910  FALSE                          NA          0
#>  9 South  01017   34450    0.00270  FALSE                          NA          0
#> 10 South  01019   25224    0.00198  FALSE                          NA          0
#> # ℹ 3,134 more rows
#> # ℹ 1 more variable: ExpectedHits <dbl>
# Compare the two results after removing the columns that depend on the random
# draw (and the extra `n` column in samp1), so that any remaining difference is
# in the deterministic columns such as ExpectedHits.
waldo::compare(
  select(
    samp1,
    -c(SelectionIndicator, SamplingWeight, NumberHits, n)
  ),
  select(
    samp2,
    -c(SelectionIndicator, SamplingWeight, NumberHits)
  )
)
#> old vs new
#>               ExpectedHits
#> - old[1, ]    2.322515e-02
#> + new[1, ]    4.645031e-03
#> - old[2, ]    9.399949e-02
#> + new[2, ]    1.879990e-02
#> - old[3, ]    9.698662e-03
#> + new[3, ]    1.939732e-03
#> - old[4, ]    8.678142e-03
#> + new[4, ]    1.735628e-03
#> - old[5, ]    2.322790e-02
#> + new[5, ]    4.645579e-03
#> - old[6, ]    3.979049e-03
#> + new[6, ]    7.958097e-04
#> - old[7, ]    7.367723e-03
#> + new[7, ]    1.473545e-03
#> - old[8, ]    4.549874e-02
#> + new[8, ]    9.099748e-03
#> - old[9, ]    1.349594e-02
#> + new[9, ]    2.699187e-03
#> - old[10, ]   9.881611e-03
#> + new[10, ]   1.976322e-03
#> and 3134 more ...
#> 
#>      old$ExpectedHits | new$ExpectedHits                  
#>  [1] 0.023225         - 0.004645         [1]              
#>  [2] 0.093999         - 0.018800         [2]              
#>  [3] 0.009699         - 0.001940         [3]              
#>  [4] 0.008678         - 0.001736         [4]              
#>  [5] 0.023228         - 0.004646         [5]              
#>  [6] 0.003979         - 0.000796         [6]              
#>  [7] 0.007368         - 0.001474         [7]              
#>  [8] 0.045499         - 0.009100         [8]              
#>  [9] 0.013496         - 0.002699         [9]              
#> [10] 0.009882         - 0.001976         [10]             
#>  ... ...                ...              and 3134 more ...

Created on 2025-11-19 with reprex v2.1.1

Session info

sessioninfo::session_info()
#> ─ Session info ───────────────────────────────────────────────────────────────
#>  setting  value
#>  version  R version 4.5.1 (2025-06-13 ucrt)
#>  os       Windows 11 x64 (build 22631)
#>  system   x86_64, mingw32
#>  ui       RTerm
#>  language (EN)
#>  collate  English_United States.utf8
#>  ctype    English_United States.utf8
#>  tz       America/New_York
#>  date     2025-11-19
#>  pandoc   3.6.3 @ C:/Program Files/RStudio/resources/app/bin/quarto/bin/tools/ (via rmarkdown)
#>  quarto   1.8.24 @ C:\\Users\\sazimmer\\AppData\\Local\\Programs\\Quarto\\bin\\quarto.exe
#> 
#> ─ Packages ───────────────────────────────────────────────────────────────────
#>  package       * version    date (UTC) lib source
#>  cli             3.6.5      2025-04-23 [1] CRAN (R 4.5.0)
#>  crayon          1.5.3      2024-06-20 [1] CRAN (R 4.5.0)
#>  data.table      1.17.8     2025-07-10 [1] RSPM (R 4.5.0)
#>  diffobj         0.3.6      2025-04-21 [1] CRAN (R 4.5.0)
#>  digest          0.6.37     2024-08-19 [1] CRAN (R 4.5.0)
#>  dplyr         * 1.1.4      2023-11-17 [1] CRAN (R 4.5.0)
#>  evaluate        1.0.5      2025-08-27 [1] RSPM (R 4.5.0)
#>  fastmap         1.2.0      2024-05-15 [1] CRAN (R 4.5.0)
#>  fs              1.6.6      2025-04-12 [1] CRAN (R 4.5.0)
#>  generics        0.1.4      2025-05-09 [1] CRAN (R 4.5.0)
#>  glue            1.8.0      2024-09-30 [1] CRAN (R 4.5.0)
#>  htmltools       0.5.8.1    2024-04-04 [1] CRAN (R 4.5.0)
#>  knitr           1.50       2025-03-16 [1] CRAN (R 4.5.0)
#>  lifecycle       1.0.4      2023-11-07 [1] CRAN (R 4.5.0)
#>  magrittr        2.0.3      2022-03-30 [1] CRAN (R 4.5.0)
#>  pillar          1.11.0     2025-07-04 [1] RSPM (R 4.5.0)
#>  pkgconfig       2.0.3      2019-09-22 [1] CRAN (R 4.5.0)
#>  R6              2.6.1      2025-02-15 [1] CRAN (R 4.5.0)
#>  reprex          2.1.1      2024-07-06 [1] CRAN (R 4.5.0)
#>  rlang           1.1.6      2025-04-11 [1] CRAN (R 4.5.0)
#>  rmarkdown       2.29       2024-11-04 [1] CRAN (R 4.5.0)
#>  rstudioapi      0.17.1     2024-10-22 [1] CRAN (R 4.5.0)
#>  SampleSelectR * 1.0.0      2025-09-22 [1] Github (rti-international/SampleSelectR@2f7d23c)
#>  sessioninfo     1.2.3.9000 2025-09-18 [1] Github (r-lib/sessioninfo@ec4dd0c)
#>  tibble          3.3.0      2025-06-08 [1] RSPM (R 4.5.0)
#>  tidyselect      1.2.1      2024-03-11 [1] CRAN (R 4.5.0)
#>  tidytable       0.11.2     2024-12-11 [1] CRAN (R 4.5.0)
#>  utf8            1.2.6      2025-06-08 [1] RSPM (R 4.5.0)
#>  vctrs           0.6.5      2023-12-01 [1] CRAN (R 4.5.0)
#>  waldo           0.6.2      2025-07-11 [1] RSPM
#>  withr           3.0.2      2024-10-28 [1] CRAN (R 4.5.0)
#>  xfun            0.53       2025-08-19 [1] RSPM (R 4.5.0)
#>  yaml            2.3.10     2024-07-26 [1] CRAN (R 4.5.0)
#> 
#>  [1] C:/Users/sazimmer/AppData/Local/R/win-library/4.5
#>  [2] C:/Program Files/R/R-4.5.1/library
#>  * ── Packages attached to the search path.
#> 
#> ──────────────────────────────────────────────────────────────────────────────

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions