add conditional formatting to number of sampled addresses per postcode & fix error in SHS sample size check

emmaschw · emmaschw · commit 2c98c18a1e20 · 2024-06-05T08:19:44.000+01:00
diff --git a/functions/qa_export.R b/functions/qa_export.R
@@ -133,20 +133,48 @@ qa_export <- function(list_df, survey){
   
   # udprn ----
   
-  # add red/green colouring to columns containing the word 'udprn'
+  # add red colouring to columns containing the word 'udprn'
   # red if udprn isn't 0 (i.e., udprn has been previously sampled)
   
   udprn <- grep("^udprn", colnames(data))
-  if(names(list_df[i]) == "previously.sampled.udprn"){
+  if(names(list_df[i]) == "previously.sampled.udprn" && # && skips nrow() on other sheets
+     nrow(list_df[["previously.sampled.udprn"]]) != 0){
     conditionalFormatting(wb = wb, sheet = sheet,
                           cols = udprn, 
                           rows = 2:(nrow(data)+1),
                           type = "expression",
                           rule = ' != 0',
                           style = redstyle)
   }
+  
+  # sampled postcodes ----
+  
+  # add red/green colouring to sampled postcodes
+  # red if more than 10 addresses were sampled in one postcode
+ 
+  if(names(list_df[i]) == "sampled.postcodes"){
+    conditionalFormatting(wb = wb, 
+                          sheet = sheet,
+                          cols = 2, 
+                          rows = 2:(nrow(data)+1),
+                          type = "expression",
+                          rule = ' > 10',
+                          style = redstyle)
+  }
+  
+  if(names(list_df[i]) == "sampled.postcodes"){
+    conditionalFormatting(wb = wb, 
+                          sheet = sheet,
+                          cols = 2, 
+                          rows = 2:(nrow(data)+1),
+                          type = "expression",
+                          rule = ' <= 10',
+                          style = greenstyle)
   }
   
+  
+}
+  
   # export to Excel file
   path <- eval(as.name(paste0(survey, ".path")))
   saveWorkbook(wb, file = paste0(path, 
diff --git a/scripts/01_paf.R b/scripts/01_paf.R
@@ -47,7 +47,8 @@ rawpaf <-  read_csv(infilenm.path,
                                    Locality, Town, Postcode, PrintAddress,
                                    Multi_occupancy, CouncilArea, UDPRN,
                                    YCOORD, XCOORD, "2011Datazone", LACode, 
-                                   UPRN, CouncilTaxBand)) %>%
+                                   UPRN, CouncilTaxBand),
+                    show_col_types = FALSE) %>%
   clean_names_modified() %>%
   mutate(datazone = substr(x2011datazone, 1, 9),
          udprn = as.numeric(udprn))
@@ -233,8 +234,14 @@ residential <- shes.strata %>%
          shes_y2 = ifelse(shes_set == "B", 1, 0),
          shes_y3 = ifelse(shes_set == "C", 1, 0),
          shes_y4 = ifelse(shes_set == "D", 1, 0)) %>%
-  right_join(dz_info) %>%
-  right_join(residential)
+  right_join(dz_info,
+             by = join_by(dz11),
+             suffix = c('.x', '')) %>%
+  select(-contains('.x')) %>%
+  right_join(residential,
+             by = join_by(dz11),
+             suffix = c('.x', '')) %>%
+  select(-contains('.x')) 
 nrow(residential)
 
 # Remove observations with infrequent la_scode, la_code and la combination
@@ -246,7 +253,10 @@ pafaux <- residential %>%
 
 # Merge residential with pafaux
 paf_check <- residential %>% 
-  left_join(pafaux)
+  left_join(pafaux,
+            by = join_by(la_code),
+            suffix = c('', '.y')) %>%
+  select(-contains('.y'))
 nrow(paf_check)
 
 # Harmonise la and la_code variables
diff --git a/scripts/03_shs_sampling.R b/scripts/03_shs_sampling.R
@@ -135,8 +135,8 @@ shs.contractorsample <- shs.mainsample %>%
   # Merge with main sample
   right_join(shs.mainsample,
              by = join_by(udprn),
-             suffix = c('', '.y')) %>%
-  select(-contains('.y')) %>%
+             suffix = c('.x', '')) %>%
+  select(-contains('.x')) %>%
   
   # Replace NAs in houseconditionflag with 0
   mutate(houseconditionflag = replace_na(houseconditionflag, 0))
diff --git a/scripts/04_shes_checking.R b/scripts/04_shes_checking.R
@@ -202,6 +202,25 @@ table(shes.biomod.frameandmatchedsample$health_board,
 
 shes.biomod.frameandmatchedsample %>% count(health_board)
 
+### 13 - Check urban/rural by core in contractor sample ----
+
+core.qa <- contractor.sample %>%
+  filter(core == 1) %>%
+  group_by(la, sample_type) %>%
+  summarise(mean = mean(dz11_urbrur2020),
+            .groups = "drop") %>%
+  pivot_wider(names_from = sample_type,
+              values_from = mean) %>%
+  ungroup() %>%
+  mutate(diff = .[[3]] - .[[2]])
+
+if(any(abs(core.qa$diff) > paf_sample.threshold)){ # outside +/- threshold
+  warning(print(paste("For at least one local authority,",
+                      "the difference in urban/rural classification",
+                      "between core bio and core non-bio",
+                      "is greater than expected")))
+}
+
 ### 12 - Check data zones in contractor sample ----
 
 # Add message to inform user about progress
@@ -250,7 +269,8 @@ qa <- list(contractor.sample = contractor.sample,
            contractor.datazone = contractor.datazone.qa,
            contractor.simdq.la = contractor.simdq.qa,
            contractor.urbrur = contractor.urbrur.qa[[2]],
-           contractor.urbrur.la = contractor.urbrur.qa[[1]])
+           contractor.urbrur.la = contractor.urbrur.qa[[1]],
+           contractor.urbrur.core = core.qa)
 
 # Export to Excel
 
diff --git a/scripts/04_shs_checking.R b/scripts/04_shs_checking.R
@@ -25,7 +25,7 @@ survey <- "shs"
 source(here::here("scripts", "00_setup.R"))
 
 # Add message to inform user about progress
-cat(crayon::bold("\nExecute checking script"))
+message(title("Execute checking script"))
 
 ### 1 - Import data ----