Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .Rbuildignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,5 @@
^_pkgdown\.yml$
^docs$
^pkgdown$
^\.positai$
^\.claude$
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,4 @@
.DS_Store
.quarto
docs
.positai
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,5 @@ importFrom(data.table,.NGRP)
importFrom(data.table,.SD)
importFrom(data.table,data.table)
importFrom(rlang,.data)
importFrom(rlang,.env)
importFrom(stats,runif)
3 changes: 2 additions & 1 deletion R/SampleSelectR-package.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@
#' @importFrom data.table .SD
#' @importFrom data.table :=
#' @importFrom data.table data.table
#' @importFrom stats runif
#' @importFrom rlang .data
#' @importFrom rlang .env
#' @importFrom stats runif
## usethis namespace: end
NULL
11 changes: 7 additions & 4 deletions R/allocate.R
Original file line number Diff line number Diff line change
Expand Up @@ -279,7 +279,10 @@ allocate <- function(allocation, N.h, n.samp = NULL, S.h = NULL, c.h = NULL, cos
}
}
# Calculate the total (raw) sample size
n <- max(ceiling(sum(allocations)), length(N.h) * lbound)


final_n <- max(ceiling(sum(allocations)), length(N.h) * lbound)


sizes <- allocations

Expand All @@ -292,7 +295,7 @@ allocate <- function(allocation, N.h, n.samp = NULL, S.h = NULL, c.h = NULL, cos
total_allocated <- sum(adjusted_allocations)

# Calculate the difference from the total n
difference <- n - total_allocated
difference <- final_n - total_allocated

# If difference is positive, distribute it proportionally
if (difference != 0) {
Expand All @@ -314,7 +317,7 @@ allocate <- function(allocation, N.h, n.samp = NULL, S.h = NULL, c.h = NULL, cos

# Step 5: Adjust the rounded allocations to ensure the sum equals n
total_allocated <- sum(rounded_allocations)
difference <- n - total_allocated
difference <- final_n - total_allocated
# Adjust the allocations by adding/subtracting the difference
while (difference != 0) {
i <- sample(1:num_groups, 1) # Randomly select an index
Expand Down Expand Up @@ -345,7 +348,7 @@ allocate <- function(allocation, N.h, n.samp = NULL, S.h = NULL, c.h = NULL, cos
}
outputs <- as.integer(rounded_allocations)
if (allocation == "optimal") {
n.print <- n
n.print <- final_n
} else {
n.print <- n.samp
}
Expand Down
4 changes: 2 additions & 2 deletions R/chromy_pps.R
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,8 @@ chromy_pps <- function(frame, n, mos, outall = FALSE, curstrat = NULL) {
frame_hits <-
frame |>
tidytable::mutate(
ExpectedHits = exphits,
NumberHits = chromy_inner(exphits),
ExpectedHits = .env$exphits,
NumberHits = chromy_inner(.env$exphits),
SelectionIndicator = .data$NumberHits > 0,
SamplingWeight = ifelse(.data$SelectionIndicator, 1 / .data$ExpectedHits, NA),
)
Expand Down
18 changes: 12 additions & 6 deletions R/srs.R
Original file line number Diff line number Diff line change
Expand Up @@ -28,25 +28,30 @@

#' @export


srs <- function(frame, n, outall = FALSE, curstrat = NULL) {
check_frame_type(frame)
check_n(n, frame, curstrat, n_le_N = TRUE)
check_outall(outall)

N <- nrow(frame)

# Take the srs and create sampling-related columns
# Take the SRS and create sampling-related columns
selectedVector <- sample(x = N, size = n, replace = FALSE)

frame <- frame |>
tidytable::mutate(
rowNum = tidytable::row_number(),
SelectionProbability = n / N,
SamplingWeight = ifelse(.data$rowNum %in% selectedVector, N / n, NA),
SelectionIndicator = ifelse(.data$rowNum %in% selectedVector, TRUE, FALSE)
SelectionProbability = .env$n / .env$N,
SamplingWeight = tidytable::if_else(
.data$rowNum %in% .env$selectedVector,
.env$N / .env$n,
NA_real_
),
SelectionIndicator = .data$rowNum %in% .env$selectedVector
) |>
tidytable::select(-tidytable::all_of("rowNum"))


# Output to screen
Sampling_Output(n, N, curstrat = curstrat)

Expand All @@ -55,8 +60,9 @@ srs <- function(frame, n, outall = FALSE, curstrat = NULL) {
sample <- frame |>
tidytable::filter(.data$SelectionIndicator) |>
tidytable::select(-tidytable::all_of("SelectionIndicator"))

return(sample)
} else if (outall) {
} else {
return(frame)
}
}
23 changes: 12 additions & 11 deletions R/sys.R
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
#'
#' @export


sys <- function(frame, n, curstrat = NULL, outall = FALSE) {
check_frame_type(frame)
check_n(n, frame, curstrat, n_le_N = TRUE)
Expand All @@ -42,32 +43,32 @@ sys <- function(frame, n, curstrat = NULL, outall = FALSE) {
N <- nrow(frame)

# Sampling method

k <- N / n # Sampling interval

r <- runif(1, 1, k) # We use a random start between 1 and k
r <- runif(1, 1, k) # Random start between 1 and k

selectedVector <- floor(r + k * (0:(n - 1))) # Selected row indices

# We make sure that selected indices are within frame range

# Make sure selected indices are within frame range
selectedVector <- selectedVector[selectedVector <= N]

# Creating variables accordingly
# Create variables
frame <- frame |>
tidytable::mutate(
numrow = tidytable::row_number(),
SelectionIndicator = .data$numrow %in% selectedVector,
SelectionProbability = n / N,
SamplingWeight = ifelse(.data$SelectionIndicator, N / n, NA)
SelectionIndicator = .data$numrow %in% .env$selectedVector,
SelectionProbability = .env$n / .env$N,
SamplingWeight = tidytable::if_else(
.data$SelectionIndicator,
.env$N / .env$n,
NA_real_
)
) |>
tidytable::select(-tidytable::all_of("numrow"))

# Output to screen
Sampling_Output(n, N, k = k, r = r, curstrat = curstrat)

# Return only selected rows and make sure the selected sample is a data.frame, tibble, or data.table

# Return only the sample or the frame with selection indicator based on value of outall
if (outall) {
return(frame)
} else {
Expand Down
11 changes: 6 additions & 5 deletions R/sys_pps.R
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ sys_pps <- function(frame, n, mos, outall = FALSE, curstrat = NULL) {
tbd_data_1 <- frame |>
tidytable::mutate(
rowNum = tidytable::row_number(),
ExpectedHits = n * (!!(symbol_mos) / totalSize),
ExpectedHits = .env$n * (!!(symbol_mos) / .env$totalSize),
SamplingWeight = .data$ExpectedHits^-1
)

Expand All @@ -64,11 +64,12 @@ sys_pps <- function(frame, n, mos, outall = FALSE, curstrat = NULL) {
selectedVector <- findInterval(selectedSizePoints, sizeIntervals)

# Using selectedVector, get the total counts of each index

selectedVector_counts <- selectedVector |>
as.data.frame() |>
tidytable::count(selectedVector) |>
# Rename to NumberHits
tidytable::rename(NumberHits = n)
tidytable::count(selectedVector, name = "NumberHits")



tbd_data_2 <- tbd_data_1 |>
tidytable::left_join(
Expand All @@ -78,7 +79,7 @@ sys_pps <- function(frame, n, mos, outall = FALSE, curstrat = NULL) {
# Need to zero filled NumberHits
tidytable::mutate(
NumberHits = tidytable::replace_na(.data$NumberHits, replace = 0),
SelectionIndicator = .data$rowNum %in% selectedVector,
SelectionIndicator = .data$rowNum %in% .env$selectedVector,
# Make SamplingWeight to be NA if not selected
SamplingWeight = tidytable::case_when(
SelectionIndicator == TRUE ~ .data$SamplingWeight,
Expand Down
1 change: 1 addition & 0 deletions R/util.R
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@

#' Check if the frame is a valid data structure
#'
#' Ensures that the input frame is a data.frame, data.table, or tibble.
Expand Down
1 change: 1 addition & 0 deletions man/SampleSelectR-package.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

147 changes: 147 additions & 0 deletions tests/testthat/test-select_sample.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
test_that("n on frame does not cause issue - chromy_pps", {
county_2023_slim_n <- county_2023 |>
tidytable::select(GEOID, Region, Pop_Tot) |>
tidytable::mutate(
n = 50,
ExpHits_man = 10 * Pop_Tot / sum(Pop_Tot),
.by = "Region"
)

sampsizes <- county_2023_slim_n |>
tidytable::distinct(Region) |>
tidytable::mutate(sample_size = 10)

set.seed(12345)
samp1 <- county_2023_slim_n |>
select_sample(
"chromy_pps",
n = sampsizes,
strata = "Region",
mos = "Pop_Tot",
outall = TRUE
)
set.seed(12345)
samp2 <- county_2023_slim_n |>
tidytable::select(-n) |>
select_sample(
"chromy_pps",
n = sampsizes,
strata = "Region",
mos = "Pop_Tot",
outall = TRUE
)
expect_equal(
samp1 |> tidytable::select(-n),
samp2
)
})

test_that("n on frame does not cause issue - srs", {
county_2023_slim_n <- county_2023 |>
tidytable::select(GEOID, Region, Pop_Tot) |>
tidytable::mutate(
n = 50,
ExpHits_man = 10 * Pop_Tot / sum(Pop_Tot),
.by = "Region"
)

sampsizes <- county_2023_slim_n |>
tidytable::distinct(Region) |>
tidytable::mutate(sample_size = 10)

set.seed(12345)
samp1 <- county_2023_slim_n |>
select_sample(
"srs",
n = sampsizes,
strata = "Region",
outall = TRUE
)
set.seed(12345)
samp2 <- county_2023_slim_n |>
tidytable::select(-n) |>
select_sample(
"srs",
n = sampsizes,
strata = "Region",
outall = TRUE
)
expect_equal(
samp1 |> tidytable::select(-n),
samp2
)
})

test_that("n on frame does not cause issue - sys_pps", {
county_2023_slim_n <- county_2023 |>
tidytable::select(GEOID, Region, Pop_Tot) |>
tidytable::mutate(
n = 50,
ExpHits_man = 10 * Pop_Tot / sum(Pop_Tot),
.by = "Region"
)

sampsizes <- county_2023_slim_n |>
tidytable::distinct(Region) |>
tidytable::mutate(sample_size = 10)

set.seed(12345)
samp1 <- county_2023_slim_n |>
select_sample(
"sys_pps",
n = sampsizes,
strata = "Region",
mos = "Pop_Tot",
outall = TRUE
)
set.seed(12345)
samp2 <- county_2023_slim_n |>
tidytable::select(-n) |>
select_sample(
"sys_pps",
n = sampsizes,
strata = "Region",
mos = "Pop_Tot",
outall = TRUE
)
expect_equal(
samp1 |> tidytable::select(-n),
samp2
)
})

test_that("n on frame does not cause issue - sys", {
county_2023_slim_n <- county_2023 |>
tidytable::select(GEOID, Region, Pop_Tot) |>
tidytable::mutate(
n = 50,
ExpHits_man = 10 * Pop_Tot / sum(Pop_Tot),
.by = "Region"
)

sampsizes <- county_2023_slim_n |>
tidytable::distinct(Region) |>
tidytable::mutate(sample_size = 10)

set.seed(12345)
samp1 <- county_2023_slim_n |>
select_sample(
"sys",
n = sampsizes,
strata = "Region",
outall = TRUE
)
set.seed(12345)
samp2 <- county_2023_slim_n |>
tidytable::select(-n) |>
select_sample(
"sys",
n = sampsizes,
strata = "Region",
outall = TRUE
)
expect_equal(
samp1 |> tidytable::select(-n),
samp2
)
})
Loading