Preparation of reefCloud data

Author

Murray Logan

Published

March 31, 2024

1 Synopsis

In the previous tutorial, we created synthetic reefCloud data. In the current tutorial, we will prepare these data for statistical analyses. Recall that we created two data sets, one representing a fixed sampling design, the other representing a random sampling design. We will prepare both of these data sets.

Necessary wrangling (preparation) steps:

  1. exclude extraneous (unneeded) fields
  2. exclude poor images
  3. lengthen the data with respect to classification type
  4. join to a labelset lookup
  5. tally up the points per date/image/GROUP/type
  6. recode transect id
  7. fill in the gaps and add the zeros
  8. sum to transect level
  9. generate a Year field from the sample date

2 Preparations

We will start by loading the required R packages.

library(knitr)
library(tidyverse)
library(easystats)
library(sf)

This tutorial will prepare the data sets generated at the end of the previous tutorials. These datasets (reef_data_synthetic_fixed.csv and reef_data_synthetic_random.csv) are reasonably large (take up substantial disk space on a repository). In order to keep the repository containing these tutorials to a manageable size, rather than track the final, large data sets, I have instead versioned the much smaller penultimate data sets (data_fixed_locs_obs.RData and data_random_locs_obs.RData). Hence, before starting this tutorial, we will first read in these penultimate data sets and repeat the final preparation and writing steps of the last tutorial. If you are working through these tutorials in sequential order and already have created the final data sets, there is no need to repeat this step.

## ---- Sampling design settings ----
Number_of_transects_per_site  <- 5
Depths                        <- 2
Number_of_frames_per_transect <- 100
Points_per_frame              <- 5

## ---- Variability settings ----
## Every sigma below is expressed on the link scale.

## hcc
hcc_site_sigma     <- 0.5   # variability in Sites within Locations
hcc_transect_sigma <- 0.2   # variability in Transects within Sites
hcc_sigma          <- 0.1   # random noise

## sc
sc_site_sigma     <- 0.05   # variability in Sites within Locations
sc_transect_sigma <- 0.02   # variability in Transects within Sites
sc_sigma          <- 0.01   # random noise

## ma
ma_site_sigma     <- 0.5    # variability in Sites within Locations
ma_transect_sigma <- 0.2    # variability in Transects within Sites
ma_sigma          <- 0.1    # random noise

Fixed design

## Split the percentage cover of each transect into individual points and
## allocate those points to frames (fixed design).
load(file = "../data/data_fixed_locs_obs.RData")
data_fixed_locs_obs <- data_fixed_locs_obs |>
  group_by(Reef, Site, Transect, Year, Depth, Date) |>
  mutate(
    ## Number of points owed to each row: its share of Value within the
    ## group, scaled to frames x points-per-frame and rounded to a whole
    ## number.
    Points = round(
      Number_of_frames_per_transect * Points_per_frame *
        (Value / sum(Value)),
      0
    ),
    ## Negative counts are not meaningful; floor them at zero.
    Points = pmax(Points, 0)
  ) |>
  ## Expand to one row per point.
  tidyr::uncount(Points) |>
  ## Shuffle the rows within each group before numbering points and frames.
  sample_n(n(), replace = FALSE) |>
  mutate(
    ## Because of the rounding above, a group need not hold exactly
    ## frames x points-per-frame rows; `length.out = n()` truncates or
    ## recycles each sequence to the group size. The argument name is
    ## spelled out in full rather than relying on partial matching.
    POINT_NO = rep_len(seq_len(Points_per_frame), length.out = n()),
    FRAME = rep(
      seq_len(Number_of_frames_per_transect),
      each = Points_per_frame,
      length.out = n()
    )
  ) |>
  ungroup()

## a |> group_by(Reef, Site, Transect, Year, Depth, Group) |>
##     summarise(Count = n()) |>
##     ungroup(Group) |>
##     mutate(Total=sum(Count),
##            Cover = Count/Total)

## Derive the project, site, survey, image and point fields for the fixed
## design, then keep only those fields in their export order.
reef_data_synthetic_fixed <-
  data_fixed_locs_obs |>
  ## Project and site fields
  mutate(
    project_id = 1,
    project_name = "synthetic_fixed",
    SITE_NO = str_replace(Site, "^S", "Site "),
    TRANSECT_NO = str_replace(Transect, "^T", "Transect "),
    site_name = factor(paste(Reef, SITE_NO)),
    ## Numeric id taken from the factor level codes of the site name
    site_id = as.numeric(site_name),
    site_latitude = Latitude,
    site_longitude = Longitude,
    site_depth = Depth,
    site_country = "synthetic Country",
    site_reef_name = factor(Reef),
    site_reef_type = NA,
    site_reef_zone = NA,
    site_code = NA,
    site_management = NA
  ) |>
  ## Survey fields
  mutate(
    survey_title = factor(
      paste(Reef, SITE_NO, TRANSECT_NO, format(Date, "%Y-%m-%d"))
    ),
    survey_id = as.numeric(survey_title),
    survey_start_date = Date,
    survey_depth = Depth,
    survey_transect_number = as.numeric(
      str_replace(TRANSECT_NO, "Transect ", "")
    )
  ) |>
  ## Image and point fields
  mutate(
    image_name = factor(paste(survey_title, FRAME)),
    image_id = as.numeric(image_name),
    image_quality = 100,
    point_no = POINT_NO,
    point_id = as.numeric(factor(paste(image_name, POINT_NO))),
    point_machine_classification = Group
  ) |>
  dplyr::select(dplyr::all_of(c(
    "project_id", "project_name",
    "site_id", "site_name", "site_latitude", "site_longitude",
    "site_depth", "site_country", "site_reef_name", "site_reef_type",
    "site_reef_zone", "site_code", "site_management",
    "survey_id", "survey_title", "survey_start_date", "survey_depth",
    "survey_transect_number",
    "image_id", "image_name", "image_quality",
    "point_id", "point_no", "point_machine_classification"
  )))
  ##   PCODE = "SYNTHETIC-fixed",
  ##   ID = 1:n(),
  ##   CRUISE_CODE = paste0("SYNTHETIC",Year),
  ##   REEF_NAME = Reef,
  ##   AIMS_REEF_NAME = Reef,
  ##   SECTOR = "synthetic",
  ##   LATITUDE = Latitude,
  ##   LONGITUDE = Longitude,
  ##   SITE_NO = Site,
  ##   TRANSECT_NO = Transect,
  ##   SITE_DEPTH = Depth,
  ##   REEF_ZONE = "-",
  ##   REPORT_YEAR = Year,
  ##   SURVEY_DATE = Date,
  ##   FRAME = paste0(PCODE, "/", REEF_NAME, "/",
  ##     REEF_ZONE, "/", SITE_NO, "/", SITE_DEPTH,
  ##     "/", TRANSECT_NO, "/", REPORT_YEAR, "/", FRAME),
  ##   POINT_NO = POINT_NO,
  ##   FAMILY = NA,
  ##   GROUP_DESC = Group,
  ##   REEFPAGE_CATEGORY = paste0(Group,"_alt")
  ## ) |>
  ## dplyr::select(PCODE, ID, CRUISE_CODE, REEF_NAME,
  ##   AIMS_REEF_NAME, SECTOR,
  ##   LATITUDE, LONGITUDE, SITE_NO, TRANSECT_NO, SITE_DEPTH,
  ##   REEF_ZONE, REPORT_YEAR, SURVEY_DATE, FRAME, POINT_NO,
  ##   FAMILY, GROUP_DESC, REEFPAGE_CATEGORY)

## Save the fixed-design data as CSV, then show its first rows as a paged
## table.
reef_data_synthetic_fixed |>
  write_csv(file = "../data/reef_data_synthetic_fixed.csv")
reef_data_synthetic_fixed |>
  head() |>
  rmarkdown::paged_table()

Random design

## Split the percentage cover of each transect into individual points and
## allocate those points to frames (random design).
load(file = "../data/data_random_locs_obs.RData")
data_random_locs_obs <- data_random_locs_obs |>
  group_by(Reef, Site, Transect, Year, Depth, Date) |>
  mutate(
    ## Number of points owed to each row: its share of Value within the
    ## group, scaled to frames x points-per-frame and rounded to a whole
    ## number.
    Points = round(
      Number_of_frames_per_transect * Points_per_frame *
        (Value / sum(Value)),
      0
    ),
    ## Negative counts are not meaningful; floor them at zero.
    Points = pmax(Points, 0)
  ) |>
  ## Expand to one row per point.
  tidyr::uncount(Points) |>
  ## Shuffle the rows within each group before numbering points and frames.
  sample_n(n(), replace = FALSE) |>
  mutate(
    ## Because of the rounding above, a group need not hold exactly
    ## frames x points-per-frame rows; `length.out = n()` truncates or
    ## recycles each sequence to the group size. The argument name is
    ## spelled out in full rather than relying on partial matching.
    POINT_NO = rep_len(seq_len(Points_per_frame), length.out = n()),
    FRAME = rep(
      seq_len(Number_of_frames_per_transect),
      each = Points_per_frame,
      length.out = n()
    )
  ) |>
  ungroup()

## Derive the project, site, survey, image and point fields for the random
## design, then keep only those fields in their export order.
reef_data_synthetic_random <-
  data_random_locs_obs |>
  ## Project and site fields
  mutate(
    project_id = 1,
    ## Project label for the random-design data set
    project_name = "synthetic_random",
    SITE_NO = str_replace(Site, "^S", "Site "),
    TRANSECT_NO = str_replace(Transect, "^T", "Transect "),
    site_name = factor(paste(Reef, SITE_NO)),
    ## Numeric id taken from the factor level codes of the site name
    site_id = as.numeric(site_name),
    site_latitude = Latitude,
    site_longitude = Longitude,
    site_depth = Depth,
    site_country = "synthetic Country",
    site_reef_name = factor(Reef),
    site_reef_type = NA,
    site_reef_zone = NA,
    site_code = NA,
    site_management = NA
  ) |>
  ## Survey fields
  mutate(
    survey_title = factor(
      paste(Reef, SITE_NO, TRANSECT_NO, format(Date, "%Y-%m-%d"))
    ),
    survey_id = as.numeric(survey_title),
    survey_start_date = Date,
    survey_depth = Depth,
    survey_transect_number = as.numeric(
      str_replace(TRANSECT_NO, "Transect ", "")
    )
  ) |>
  ## Image and point fields
  mutate(
    image_name = factor(paste(survey_title, FRAME)),
    image_id = as.numeric(image_name),
    image_quality = 100,
    point_no = POINT_NO,
    point_id = as.numeric(factor(paste(image_name, POINT_NO))),
    point_machine_classification = Group
  ) |>
  dplyr::select(dplyr::all_of(c(
    "project_id", "project_name",
    "site_id", "site_name", "site_latitude", "site_longitude",
    "site_depth", "site_country", "site_reef_name", "site_reef_type",
    "site_reef_zone", "site_code", "site_management",
    "survey_id", "survey_title", "survey_start_date", "survey_depth",
    "survey_transect_number",
    "image_id", "image_name", "image_quality",
    "point_id", "point_no", "point_machine_classification"
  )))
  ##   PCODE = "SYNTHETIC-random",
  ##   ID = 1:n(),
  ##   CRUISE_CODE = paste0("SYNTHETIC", Year),
  ##   REEF_NAME = Reef,
  ##   AIMS_REEF_NAME = Reef,
  ##   SECTOR = "synthetic",
  ##   LATITUDE = Latitude,
  ##   LONGITUDE = Longitude,
  ##   SITE_NO = Site,
  ##   TRANSECT_NO = Transect,
  ##   SITE_DEPTH = Depth,
  ##   REEF_ZONE = "-",
  ##   REPORT_YEAR = Year,
  ##   SURVEY_DATE = Date,
  ##   FRAME = paste0(PCODE, "/", REEF_NAME, "/", REEF_ZONE,
  ##     "/", SITE_NO, "/", SITE_DEPTH, "/", TRANSECT_NO,
  ##     "/", REPORT_YEAR, "/", FRAME),
  ##   POINT_NO = POINT_NO,
  ##   FAMILY = NA,
  ##   GROUP_DESC = Group,
  ##   REEFPAGE_CATEGORY = paste0(Group, "_alt")
  ## ) |>
  ## dplyr::select(
  ##   PCODE, ID, CRUISE_CODE, REEF_NAME, AIMS_REEF_NAME, SECTOR,
  ##   LATITUDE, LONGITUDE, SITE_NO, TRANSECT_NO, SITE_DEPTH,
  ##   REEF_ZONE, REPORT_YEAR, SURVEY_DATE, FRAME, POINT_NO,
  ##   FAMILY, GROUP_DESC, REEFPAGE_CATEGORY
  ## )

## Save the random-design data as CSV, then show its first rows as a paged
## table.
reef_data_synthetic_random |>
  write_csv(file = "../data/reef_data_synthetic_random.csv")
reef_data_synthetic_random |>
  head() |>
  rmarkdown::paged_table()

3 Read in the data

Let's start by reading in the data sets (which were exported as csv files). There are many functions in R that can read in a CSV file. We will use the read_csv() function as it is part of the tidyverse ecosystem.

## Import the fixed-design CSV with trim_ws switched on
data_fixed <- read_csv(
  file = "../data/reef_data_synthetic_fixed.csv",
  trim_ws = TRUE
)
Rows: 3000027 Columns: 24
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr   (7): project_name, site_name, site_country, site_reef_name, survey_tit...
dbl  (12): project_id, site_id, site_latitude, site_longitude, site_depth, s...
lgl   (4): site_reef_type, site_reef_zone, site_code, site_management
dttm  (1): survey_start_date

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

After reading in a dataset, it is always a good idea to quickly explore a few summaries in order to ascertain whether the imported data are correctly transcribed. In particular, we should pay attention to whether there are any unexpected missing values and ensure that each variable (column) has the expected class (e.g. that variables we expected to be considered numbers are indeed listed as either <dbl> or <int> and not <chr>).

## Compact overview of every column in the fixed-design data
glimpse(data_fixed)
Rows: 3,000,027
Columns: 24
$ project_id                   <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ project_name                 <chr> "synthetic_fixed", "synthetic_fixed", "sy…
$ site_id                      <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ site_name                    <chr> "Reef118 Site 1", "Reef118 Site 1", "Reef…
$ site_latitude                <dbl> -20.27968, -20.27968, -20.27968, -20.2796…
$ site_longitude               <dbl> 3.959314, 3.959314, 3.959314, 3.959314, 3…
$ site_depth                   <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,…
$ site_country                 <chr> "synthetic Country", "synthetic Country",…
$ site_reef_name               <chr> "Reef118", "Reef118", "Reef118", "Reef118…
$ site_reef_type               <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ site_reef_zone               <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ site_code                    <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ site_management              <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ survey_id                    <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ survey_title                 <chr> "Reef118 Site 1 Transect 1 2010-01-01", "…
$ survey_start_date            <dttm> 2010-01-01 04:00:00, 2010-01-01 04:00:00…
$ survey_depth                 <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,…
$ survey_transect_number       <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ image_id                     <dbl> 1, 1, 1, 1, 1, 13, 13, 13, 13, 13, 24, 24…
$ image_name                   <chr> "Reef118 Site 1 Transect 1 2010-01-01 1",…
$ image_quality                <dbl> 100, 100, 100, 100, 100, 100, 100, 100, 1…
$ point_id                     <dbl> 1, 2, 3, 4, 5, 61, 62, 63, 64, 65, 116, 1…
$ point_no                     <dbl> 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4,…
$ point_machine_classification <chr> "MA", "MA", "HCC", "MA", "MA", "MA", "MA"…
## First rows of the fixed-design data
head(data_fixed)
# A tibble: 6 × 24
  project_id project_name    site_id site_name      site_latitude site_longitude
       <dbl> <chr>             <dbl> <chr>                  <dbl>          <dbl>
1          1 synthetic_fixed       1 Reef118 Site 1         -20.3           3.96
2          1 synthetic_fixed       1 Reef118 Site 1         -20.3           3.96
3          1 synthetic_fixed       1 Reef118 Site 1         -20.3           3.96
4          1 synthetic_fixed       1 Reef118 Site 1         -20.3           3.96
5          1 synthetic_fixed       1 Reef118 Site 1         -20.3           3.96
6          1 synthetic_fixed       1 Reef118 Site 1         -20.3           3.96
# ℹ 18 more variables: site_depth <dbl>, site_country <chr>,
#   site_reef_name <chr>, site_reef_type <lgl>, site_reef_zone <lgl>,
#   site_code <lgl>, site_management <lgl>, survey_id <dbl>,
#   survey_title <chr>, survey_start_date <dttm>, survey_depth <dbl>,
#   survey_transect_number <dbl>, image_id <dbl>, image_name <chr>,
#   image_quality <dbl>, point_id <dbl>, point_no <dbl>,
#   point_machine_classification <chr>
## Internal structure of the fixed-design data
str(data_fixed)
spc_tbl_ [3,000,027 × 24] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
 $ project_id                  : num [1:3000027] 1 1 1 1 1 1 1 1 1 1 ...
 $ project_name                : chr [1:3000027] "synthetic_fixed" "synthetic_fixed" "synthetic_fixed" "synthetic_fixed" ...
 $ site_id                     : num [1:3000027] 1 1 1 1 1 1 1 1 1 1 ...
 $ site_name                   : chr [1:3000027] "Reef118 Site 1" "Reef118 Site 1" "Reef118 Site 1" "Reef118 Site 1" ...
 $ site_latitude               : num [1:3000027] -20.3 -20.3 -20.3 -20.3 -20.3 ...
 $ site_longitude              : num [1:3000027] 3.96 3.96 3.96 3.96 3.96 ...
 $ site_depth                  : num [1:3000027] 3 3 3 3 3 3 3 3 3 3 ...
 $ site_country                : chr [1:3000027] "synthetic Country" "synthetic Country" "synthetic Country" "synthetic Country" ...
 $ site_reef_name              : chr [1:3000027] "Reef118" "Reef118" "Reef118" "Reef118" ...
 $ site_reef_type              : logi [1:3000027] NA NA NA NA NA NA ...
 $ site_reef_zone              : logi [1:3000027] NA NA NA NA NA NA ...
 $ site_code                   : logi [1:3000027] NA NA NA NA NA NA ...
 $ site_management             : logi [1:3000027] NA NA NA NA NA NA ...
 $ survey_id                   : num [1:3000027] 1 1 1 1 1 1 1 1 1 1 ...
 $ survey_title                : chr [1:3000027] "Reef118 Site 1 Transect 1 2010-01-01" "Reef118 Site 1 Transect 1 2010-01-01" "Reef118 Site 1 Transect 1 2010-01-01" "Reef118 Site 1 Transect 1 2010-01-01" ...
 $ survey_start_date           : POSIXct[1:3000027], format: "2010-01-01 04:00:00" "2010-01-01 04:00:00" ...
 $ survey_depth                : num [1:3000027] 3 3 3 3 3 3 3 3 3 3 ...
 $ survey_transect_number      : num [1:3000027] 1 1 1 1 1 1 1 1 1 1 ...
 $ image_id                    : num [1:3000027] 1 1 1 1 1 13 13 13 13 13 ...
 $ image_name                  : chr [1:3000027] "Reef118 Site 1 Transect 1 2010-01-01 1" "Reef118 Site 1 Transect 1 2010-01-01 1" "Reef118 Site 1 Transect 1 2010-01-01 1" "Reef118 Site 1 Transect 1 2010-01-01 1" ...
 $ image_quality               : num [1:3000027] 100 100 100 100 100 100 100 100 100 100 ...
 $ point_id                    : num [1:3000027] 1 2 3 4 5 61 62 63 64 65 ...
 $ point_no                    : num [1:3000027] 1 2 3 4 5 1 2 3 4 5 ...
 $ point_machine_classification: chr [1:3000027] "MA" "MA" "HCC" "MA" ...
 - attr(*, "spec")=
  .. cols(
  ..   project_id = col_double(),
  ..   project_name = col_character(),
  ..   site_id = col_double(),
  ..   site_name = col_character(),
  ..   site_latitude = col_double(),
  ..   site_longitude = col_double(),
  ..   site_depth = col_double(),
  ..   site_country = col_character(),
  ..   site_reef_name = col_character(),
  ..   site_reef_type = col_logical(),
  ..   site_reef_zone = col_logical(),
  ..   site_code = col_logical(),
  ..   site_management = col_logical(),
  ..   survey_id = col_double(),
  ..   survey_title = col_character(),
  ..   survey_start_date = col_datetime(format = ""),
  ..   survey_depth = col_double(),
  ..   survey_transect_number = col_double(),
  ..   image_id = col_double(),
  ..   image_name = col_character(),
  ..   image_quality = col_double(),
  ..   point_id = col_double(),
  ..   point_no = col_double(),
  ..   point_machine_classification = col_character()
  .. )
 - attr(*, "problems")=<externalptr> 
## Build a codebook of the fixed-design data and render it as a table
knitr::kable(
  datawizard::data_codebook(data_fixed)
)
Warning: Following 4 columns were empty and have been removed:
  site_reef_type, site_reef_zone, site_code and site_management
ID Name Type Missings Values N Prop .row_id
1 project_id numeric 0 (0.0%) 1 3000027 100.0% 1
1
2 project_name character 0 (0.0%) synthetic_fixed 3000027 100.0% 2
2
3 site_id numeric 0 (0.0%) [1, 50] 3000027 3
3
4 site_name character 0 (0.0%) Reef118 Site 1 59992 2.0% 4
Reef118 Site 2 59999 2.0% 4
Reef137 Site 1 60000 2.0% 4
Reef137 Site 2 59998 2.0% 4
Reef14 Site 1 60006 2.0% 4
Reef14 Site 2 60002 2.0% 4
Reef153 Site 1 60013 2.0% 4
Reef153 Site 2 60007 2.0% 4
Reef159 Site 1 59999 2.0% 4
Reef159 Site 2 60004 2.0% 4
(…) NA NA 4
4
5 site_latitude numeric 0 (0.0%) [-20.69, -10.32] 3000027 5
5
6 site_longitude numeric 0 (0.0%) [0.86, 9.04] 3000027 6
6
7 site_depth numeric 0 (0.0%) 3 1500009 50.0% 7
10 1500018 50.0% 7
7
8 site_country character 0 (0.0%) synthetic Country 3000027 100.0% 8
8
9 site_reef_name character 0 (0.0%) Reef118 119991 4.0% 9
Reef137 119998 4.0% 9
Reef14 120008 4.0% 9
Reef153 120020 4.0% 9
Reef159 120003 4.0% 9
Reef170 119996 4.0% 9
Reef179 120003 4.0% 9
Reef185 119993 4.0% 9
Reef195 119995 4.0% 9
Reef197 119997 4.0% 9
(…) NA NA 9
9
14 survey_id numeric 0 (0.0%) [1, 3000] 3000027 10
10
15 survey_title character 0 (0.0%) Reef118 Site 1 Transect 1 2010-01-01 1000 0.0% 11
Reef118 Site 1 Transect 1 2011-01-01 1000 0.0% 11
Reef118 Site 1 Transect 1 2012-01-01 999 0.0% 11
Reef118 Site 1 Transect 1 2013-01-01 1001 0.0% 11
Reef118 Site 1 Transect 1 2014-01-01 1000 0.0% 11
Reef118 Site 1 Transect 1 2015-01-01 1000 0.0% 11
Reef118 Site 1 Transect 1 2016-01-01 1000 0.0% 11
Reef118 Site 1 Transect 1 2017-01-01 1000 0.0% 11
Reef118 Site 1 Transect 1 2018-01-01 1001 0.0% 11
Reef118 Site 1 Transect 1 2019-01-01 999 0.0% 11
(…) NA NA 11
11
16 survey_start_date numeric 0 (0.0%) 2010-01-01 04:00:00 249997 8.3% 12
2011-01-01 04:00:00 250008 8.3% 12
2012-01-01 04:00:00 250004 8.3% 12
2013-01-01 04:00:00 249990 8.3% 12
2014-01-01 04:00:00 250023 8.3% 12
2015-01-01 04:00:00 250000 8.3% 12
2016-01-01 04:00:00 250019 8.3% 12
2017-01-01 04:00:00 249990 8.3% 12
2018-01-01 04:00:00 249999 8.3% 12
2019-01-01 04:00:00 250014 8.3% 12
(…) NA NA 12
12
17 survey_depth numeric 0 (0.0%) 3 1500009 50.0% 13
10 1500018 50.0% 13
13
18 survey_transect_number numeric 0 (0.0%) 1 600012 20.0% 14
2 600003 20.0% 14
3 600007 20.0% 14
4 599997 20.0% 14
5 600008 20.0% 14
14
19 image_id numeric 0 (0.0%) [1, 300000] 3000027 15
15
20 image_name character 0 (0.0%) Reef118 Site 1 Transect 1 2010-01-01 1 10 0.0% 16
Reef118 Site 1 Transect 1 2010-01-01 10 10 0.0% 16
Reef118 Site 1 Transect 1 2010-01-01 100 10 0.0% 16
Reef118 Site 1 Transect 1 2010-01-01 11 10 0.0% 16
Reef118 Site 1 Transect 1 2010-01-01 12 10 0.0% 16
Reef118 Site 1 Transect 1 2010-01-01 13 10 0.0% 16
Reef118 Site 1 Transect 1 2010-01-01 14 10 0.0% 16
Reef118 Site 1 Transect 1 2010-01-01 15 10 0.0% 16
Reef118 Site 1 Transect 1 2010-01-01 16 10 0.0% 16
Reef118 Site 1 Transect 1 2010-01-01 17 10 0.0% 16
(…) NA NA 16
16
21 image_quality numeric 0 (0.0%) 100 3000027 100.0% 17
17
22 point_id numeric 0 (0.0%) [1, 1.49996e+06] 3000027 18
18
23 point_no numeric 0 (0.0%) 1 600746 20.0% 19
2 600000 20.0% 19
3 600000 20.0% 19
4 600000 20.0% 19
5 599281 20.0% 19
19
24 point_machine_classification character 0 (0.0%) HCC 986690 32.9% 20
MA 1844885 61.5% 20
SC 168452 5.6% 20
20
## Import the random-design CSV with trim_ws switched on
data_random <- read_csv(
  file = "../data/reef_data_synthetic_random.csv",
  trim_ws = TRUE
)
Rows: 2999943 Columns: 24
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr   (7): project_name, site_name, site_country, site_reef_name, survey_tit...
dbl  (12): project_id, site_id, site_latitude, site_longitude, site_depth, s...
lgl   (4): site_reef_type, site_reef_zone, site_code, site_management
dttm  (1): survey_start_date

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

After reading in a dataset, it is always a good idea to quickly explore a few summaries in order to ascertain whether the imported data are correctly transcribed. In particular, we should pay attention to whether there are any unexpected missing values and ensure that each variable (column) has the expected class (e.g. that variables we expected to be considered numbers are indeed listed as either <dbl> or <int> and not <chr>).

## Compact overview of every column in the random-design data
glimpse(data_random)
Rows: 2,999,943
Columns: 24
$ project_id                   <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ project_name                 <chr> "synthetic_fixed", "synthetic_fixed", "sy…
$ site_id                      <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ site_name                    <chr> "Reef1 Site 1", "Reef1 Site 1", "Reef1 Si…
$ site_latitude                <dbl> -19.90968, -19.90968, -19.90968, -19.9096…
$ site_longitude               <dbl> 9.819314, 9.819314, 9.819314, 9.819314, 9…
$ site_depth                   <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,…
$ site_country                 <chr> "synthetic Country", "synthetic Country",…
$ site_reef_name               <chr> "Reef1", "Reef1", "Reef1", "Reef1", "Reef…
$ site_reef_type               <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ site_reef_zone               <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ site_code                    <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ site_management              <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ survey_id                    <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ survey_title                 <chr> "Reef1 Site 1 Transect 1 2016-01-01", "Re…
$ survey_start_date            <dttm> 2016-01-01 04:00:00, 2016-01-01 04:00:00…
$ survey_depth                 <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,…
$ survey_transect_number       <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ image_id                     <dbl> 1, 1, 1, 1, 1, 13, 13, 13, 13, 13, 24, 24…
$ image_name                   <chr> "Reef1 Site 1 Transect 1 2016-01-01 1", "…
$ image_quality                <dbl> 100, 100, 100, 100, 100, 100, 100, 100, 1…
$ point_id                     <dbl> 1, 2, 3, 4, 5, 61, 62, 63, 64, 65, 116, 1…
$ point_no                     <dbl> 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4,…
$ point_machine_classification <chr> "SC", "HCC", "MA", "HCC", "HCC", "HCC", "…
## First rows of the random-design data
head(data_random)
# A tibble: 6 × 24
  project_id project_name    site_id site_name    site_latitude site_longitude
       <dbl> <chr>             <dbl> <chr>                <dbl>          <dbl>
1          1 synthetic_fixed       1 Reef1 Site 1         -19.9           9.82
2          1 synthetic_fixed       1 Reef1 Site 1         -19.9           9.82
3          1 synthetic_fixed       1 Reef1 Site 1         -19.9           9.82
4          1 synthetic_fixed       1 Reef1 Site 1         -19.9           9.82
5          1 synthetic_fixed       1 Reef1 Site 1         -19.9           9.82
6          1 synthetic_fixed       1 Reef1 Site 1         -19.9           9.82
# ℹ 18 more variables: site_depth <dbl>, site_country <chr>,
#   site_reef_name <chr>, site_reef_type <lgl>, site_reef_zone <lgl>,
#   site_code <lgl>, site_management <lgl>, survey_id <dbl>,
#   survey_title <chr>, survey_start_date <dttm>, survey_depth <dbl>,
#   survey_transect_number <dbl>, image_id <dbl>, image_name <chr>,
#   image_quality <dbl>, point_id <dbl>, point_no <dbl>,
#   point_machine_classification <chr>
## Internal structure of the random-design data
str(data_random)
spc_tbl_ [2,999,943 × 24] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
 $ project_id                  : num [1:2999943] 1 1 1 1 1 1 1 1 1 1 ...
 $ project_name                : chr [1:2999943] "synthetic_fixed" "synthetic_fixed" "synthetic_fixed" "synthetic_fixed" ...
 $ site_id                     : num [1:2999943] 1 1 1 1 1 1 1 1 1 1 ...
 $ site_name                   : chr [1:2999943] "Reef1 Site 1" "Reef1 Site 1" "Reef1 Site 1" "Reef1 Site 1" ...
 $ site_latitude               : num [1:2999943] -19.9 -19.9 -19.9 -19.9 -19.9 ...
 $ site_longitude              : num [1:2999943] 9.82 9.82 9.82 9.82 9.82 ...
 $ site_depth                  : num [1:2999943] 3 3 3 3 3 3 3 3 3 3 ...
 $ site_country                : chr [1:2999943] "synthetic Country" "synthetic Country" "synthetic Country" "synthetic Country" ...
 $ site_reef_name              : chr [1:2999943] "Reef1" "Reef1" "Reef1" "Reef1" ...
 $ site_reef_type              : logi [1:2999943] NA NA NA NA NA NA ...
 $ site_reef_zone              : logi [1:2999943] NA NA NA NA NA NA ...
 $ site_code                   : logi [1:2999943] NA NA NA NA NA NA ...
 $ site_management             : logi [1:2999943] NA NA NA NA NA NA ...
 $ survey_id                   : num [1:2999943] 1 1 1 1 1 1 1 1 1 1 ...
 $ survey_title                : chr [1:2999943] "Reef1 Site 1 Transect 1 2016-01-01" "Reef1 Site 1 Transect 1 2016-01-01" "Reef1 Site 1 Transect 1 2016-01-01" "Reef1 Site 1 Transect 1 2016-01-01" ...
 $ survey_start_date           : POSIXct[1:2999943], format: "2016-01-01 04:00:00" "2016-01-01 04:00:00" ...
 $ survey_depth                : num [1:2999943] 3 3 3 3 3 3 3 3 3 3 ...
 $ survey_transect_number      : num [1:2999943] 1 1 1 1 1 1 1 1 1 1 ...
 $ image_id                    : num [1:2999943] 1 1 1 1 1 13 13 13 13 13 ...
 $ image_name                  : chr [1:2999943] "Reef1 Site 1 Transect 1 2016-01-01 1" "Reef1 Site 1 Transect 1 2016-01-01 1" "Reef1 Site 1 Transect 1 2016-01-01 1" "Reef1 Site 1 Transect 1 2016-01-01 1" ...
 $ image_quality               : num [1:2999943] 100 100 100 100 100 100 100 100 100 100 ...
 $ point_id                    : num [1:2999943] 1 2 3 4 5 61 62 63 64 65 ...
 $ point_no                    : num [1:2999943] 1 2 3 4 5 1 2 3 4 5 ...
 $ point_machine_classification: chr [1:2999943] "SC" "HCC" "MA" "HCC" ...
 - attr(*, "spec")=
  .. cols(
  ..   project_id = col_double(),
  ..   project_name = col_character(),
  ..   site_id = col_double(),
  ..   site_name = col_character(),
  ..   site_latitude = col_double(),
  ..   site_longitude = col_double(),
  ..   site_depth = col_double(),
  ..   site_country = col_character(),
  ..   site_reef_name = col_character(),
  ..   site_reef_type = col_logical(),
  ..   site_reef_zone = col_logical(),
  ..   site_code = col_logical(),
  ..   site_management = col_logical(),
  ..   survey_id = col_double(),
  ..   survey_title = col_character(),
  ..   survey_start_date = col_datetime(format = ""),
  ..   survey_depth = col_double(),
  ..   survey_transect_number = col_double(),
  ..   image_id = col_double(),
  ..   image_name = col_character(),
  ..   image_quality = col_double(),
  ..   point_id = col_double(),
  ..   point_no = col_double(),
  ..   point_machine_classification = col_character()
  .. )
 - attr(*, "problems")=<externalptr> 
## Build a codebook of the random-design data and render it as a table
knitr::kable(
  datawizard::data_codebook(data_random)
)
Warning: Following 4 columns were empty and have been removed:
  site_reef_type, site_reef_zone, site_code and site_management
ID Name Type Missings Values N Prop .row_id
1 project_id numeric 0 (0.0%) 1 2999943 100.0% 1
1
2 project_name character 0 (0.0%) synthetic_fixed 2999943 100.0% 2
2
3 site_id numeric 0 (0.0%) [1, 350] 2999943 3
3
4 site_name character 0 (0.0%) Reef1 Site 1 4998 0.2% 4
Reef1 Site 2 5000 0.2% 4
Reef10 Site 1 10000 0.3% 4
Reef10 Site 2 10001 0.3% 4
Reef101 Site 1 5000 0.2% 4
Reef101 Site 2 4999 0.2% 4
Reef102 Site 1 4999 0.2% 4
Reef102 Site 2 5002 0.2% 4
Reef103 Site 1 4999 0.2% 4
Reef103 Site 2 5000 0.2% 4
(…) NA NA 4
4
5 site_latitude numeric 0 (0.0%) [-20.99, -10.25] 2999943 5
5
6 site_longitude numeric 0 (0.0%) [0.34, 9.82] 2999943 6
6
7 site_depth numeric 0 (0.0%) 3 1499995 50.0% 7
10 1499948 50.0% 7
7
8 site_country character 0 (0.0%) synthetic Country 2999943 100.0% 8
8
9 site_reef_name character 0 (0.0%) Reef1 9998 0.3% 9
Reef10 20001 0.7% 9
Reef101 9999 0.3% 9
Reef102 10001 0.3% 9
Reef103 9999 0.3% 9
Reef104 10001 0.3% 9
Reef105 10002 0.3% 9
Reef106 20002 0.7% 9
Reef107 30001 1.0% 9
Reef108 9999 0.3% 9
(…) NA NA 9
9
14 survey_id numeric 0 (0.0%) [1, 3000] 2999943 10
10
15 survey_title character 0 (0.0%) Reef1 Site 1 Transect 1 2016-01-01 1000 0.0% 11
Reef1 Site 1 Transect 2 2016-01-01 999 0.0% 11
Reef1 Site 1 Transect 3 2016-01-01 998 0.0% 11
Reef1 Site 1 Transect 4 2016-01-01 1001 0.0% 11
Reef1 Site 1 Transect 5 2016-01-01 1000 0.0% 11
Reef1 Site 2 Transect 1 2016-01-01 1000 0.0% 11
Reef1 Site 2 Transect 2 2016-01-01 1000 0.0% 11
Reef1 Site 2 Transect 3 2016-01-01 1000 0.0% 11
Reef1 Site 2 Transect 4 2016-01-01 1000 0.0% 11
Reef1 Site 2 Transect 5 2016-01-01 1000 0.0% 11
(…) NA NA 11
11
16 survey_start_date numeric 0 (0.0%) 2010-01-01 04:00:00 250001 8.3% 12
2011-01-01 04:00:00 249987 8.3% 12
2012-01-01 04:00:00 249996 8.3% 12
2013-01-01 04:00:00 249986 8.3% 12
2014-01-01 04:00:00 249997 8.3% 12
2015-01-01 04:00:00 250021 8.3% 12
2016-01-01 04:00:00 249986 8.3% 12
2017-01-01 04:00:00 250007 8.3% 12
2018-01-01 04:00:00 249995 8.3% 12
2019-01-01 04:00:00 250007 8.3% 12
(…) NA NA 12
12
17 survey_depth numeric 0 (0.0%) 3 1499995 50.0% 13
10 1499948 50.0% 13
13
18 survey_transect_number numeric 0 (0.0%) 1 599986 20.0% 14
2 599992 20.0% 14
3 599980 20.0% 14
4 599992 20.0% 14
5 599993 20.0% 14
14
19 image_id numeric 0 (0.0%) [1, 300000] 2999943 15
15
20 image_name character 0 (0.0%) Reef1 Site 1 Transect 1 2016-01-01 1 10 0.0% 16
Reef1 Site 1 Transect 1 2016-01-01 10 10 0.0% 16
Reef1 Site 1 Transect 1 2016-01-01 100 10 0.0% 16
Reef1 Site 1 Transect 1 2016-01-01 11 10 0.0% 16
Reef1 Site 1 Transect 1 2016-01-01 12 10 0.0% 16
Reef1 Site 1 Transect 1 2016-01-01 13 10 0.0% 16
Reef1 Site 1 Transect 1 2016-01-01 14 10 0.0% 16
Reef1 Site 1 Transect 1 2016-01-01 15 10 0.0% 16
Reef1 Site 1 Transect 1 2016-01-01 16 10 0.0% 16
Reef1 Site 1 Transect 1 2016-01-01 17 10 0.0% 16
(…) NA NA 16
16
21 image_quality numeric 0 (0.0%) 100 2999943 100.0% 17
17
22 point_id numeric 0 (0.0%) [1, 1.49994e+06] 2999943 18
18
23 point_no numeric 0 (0.0%) 1 600714 20.0% 19
2 600000 20.0% 19
3 600000 20.0% 19
4 600000 20.0% 19
5 599229 20.0% 19
19
24 point_machine_classification character 0 (0.0%) HCC 1046658 34.9% 20
MA 1776443 59.2% 20
SC 176842 5.9% 20
20

4 Excluding extraneous fields

As these are synthetic data, not all the typical reefCloud fields are present. Nevertheless, there are still a large number of fields (columns) in this dataset, many of which we are going to ignore for this exercise. The important fields are:

  • site_id - a unique identifier of the site
  • site_name - a unique name of the site
  • site_latitude - latitude of the site
  • site_longitude - longitude of the site
  • survey_start_date - date (and time) of survey
  • survey_depth - depth at which the survey took place
  • survey_transect_number - unique identifier of the transect
  • image_id - unique identifier of the image
  • image_quality - indication of the quality of the image
  • point_id - unique identifier of the point
  • point_no - the number of the point within the image
  • point_machine_classification - classification determined by AI

Although it is often harmless enough to retain the other fields, it does make reviewing the data more cumbersome, so at this early stage of the exercise, we will restrict the data to just the above fields.

# Restrict the fixed-design data to the fields used in the remaining steps,
# then preview the first rows.
data_fixed <- dplyr::select(
  data_fixed,
  site_id, site_name, site_latitude, site_longitude,
  survey_start_date, survey_depth, survey_transect_number,
  image_id, image_quality,
  point_id, point_no, point_machine_classification
)
head(data_fixed)
# A tibble: 6 × 12
  site_id site_name      site_latitude site_longitude survey_start_date  
    <dbl> <chr>                  <dbl>          <dbl> <dttm>             
1       1 Reef118 Site 1         -20.3           3.96 2010-01-01 04:00:00
2       1 Reef118 Site 1         -20.3           3.96 2010-01-01 04:00:00
3       1 Reef118 Site 1         -20.3           3.96 2010-01-01 04:00:00
4       1 Reef118 Site 1         -20.3           3.96 2010-01-01 04:00:00
5       1 Reef118 Site 1         -20.3           3.96 2010-01-01 04:00:00
6       1 Reef118 Site 1         -20.3           3.96 2010-01-01 04:00:00
# ℹ 7 more variables: survey_depth <dbl>, survey_transect_number <dbl>,
#   image_id <dbl>, image_quality <dbl>, point_id <dbl>, point_no <dbl>,
#   point_machine_classification <chr>
# Restrict the random-design data to the fields used in the remaining steps,
# then preview the first rows.
data_random <- dplyr::select(
  data_random,
  site_id, site_name, site_latitude, site_longitude,
  survey_start_date, survey_depth, survey_transect_number,
  image_id, image_quality,
  point_id, point_no, point_machine_classification
)
head(data_random)
# A tibble: 6 × 12
  site_id site_name    site_latitude site_longitude survey_start_date  
    <dbl> <chr>                <dbl>          <dbl> <dttm>             
1       1 Reef1 Site 1         -19.9           9.82 2016-01-01 04:00:00
2       1 Reef1 Site 1         -19.9           9.82 2016-01-01 04:00:00
3       1 Reef1 Site 1         -19.9           9.82 2016-01-01 04:00:00
4       1 Reef1 Site 1         -19.9           9.82 2016-01-01 04:00:00
5       1 Reef1 Site 1         -19.9           9.82 2016-01-01 04:00:00
6       1 Reef1 Site 1         -19.9           9.82 2016-01-01 04:00:00
# ℹ 7 more variables: survey_depth <dbl>, survey_transect_number <dbl>,
#   image_id <dbl>, image_quality <dbl>, point_id <dbl>, point_no <dbl>,
#   point_machine_classification <chr>

5 Excluding poor images

# Drop points from images whose quality score is exactly 0. Images with a
# missing quality score are retained (`NA %in% 0` is FALSE).
data_fixed <- data_fixed |>
  dplyr::filter(!(image_quality %in% 0))
data_random <- data_random |>
  dplyr::filter(!(image_quality %in% 0))

6 Lengthen the data

To facilitate most graphical and statistical modelling routines, data must be structured such that each row represents an individual record and that the variables are in columns.

# Stack every point_*_classification column into a pair of columns:
# `type` (the source column name) and `classification` (its value).
data_fixed <- tidyr::pivot_longer(
  data_fixed,
  cols = matches("point_.*_classification"),
  names_to = "type",
  values_to = "classification"
)
data_random <- tidyr::pivot_longer(
  data_random,
  cols = matches("point_.*_classification"),
  names_to = "type",
  values_to = "classification"
)

7 Joining to the code group lookup data

Within reefCloud, the taxonomic resolution of point classification depends on the granularity of training label sets. For many analyses (such as spatio-temporal modelling of hard coral cover), this is overly granular. In order to group the taxonomic levels up to the larger groups (such as hard coral, soft coral and macroalgae), it is necessary to join the data to a lookup table representing the labelsets. However, the current synthetic data were only constructed to the broad categories (hard coral, soft coral and macroalgae) in the first place.

Although the current data are already recorded in the desired taxonomic resolution, for code completeness, we will still join in the labelset data (which we will first generate here).

# Build the labelset lookup column by column, write it to disk and read it
# back in, so the rest of the tutorial works from the CSV version.
labelset <- tibble(
  CODE = c("HCC", "SC", "MA"),
  DESCRIPTION = c("Hard coral", "Soft coral", "Macroalgae"),
  `FUNCTIONAL GROUP` = c("Hard coral", "Soft coral", "Macroalgae"),
  `KEYBOARD SHORTCUT CODE` = NA
)
write_csv(labelset, file = "../data/labelset.csv")
labelset <- read_csv("../data/labelset.csv", trim_ws = TRUE)
Rows: 3 Columns: 4
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (3): CODE, DESCRIPTION, FUNCTIONAL GROUP
lgl (1): KEYBOARD SHORTCUT CODE

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
glimpse(labelset)  # column-wise overview of the lookup table
Rows: 3
Columns: 4
$ CODE                     <chr> "HCC", "SC", "MA"
$ DESCRIPTION              <chr> "Hard coral", "Soft coral", "Macroalgae"
$ `FUNCTIONAL GROUP`       <chr> "Hard coral", "Soft coral", "Macroalgae"
$ `KEYBOARD SHORTCUT CODE` <lgl> NA, NA, NA
head(labelset)  # first rows of the lookup table
# A tibble: 3 × 4
  CODE  DESCRIPTION `FUNCTIONAL GROUP` `KEYBOARD SHORTCUT CODE`
  <chr> <chr>       <chr>              <lgl>                   
1 HCC   Hard coral  Hard coral         NA                      
2 SC    Soft coral  Soft coral         NA                      
3 MA    Macroalgae  Macroalgae         NA                      
str(labelset)  # internal structure, including the readr column spec
spc_tbl_ [3 × 4] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
 $ CODE                  : chr [1:3] "HCC" "SC" "MA"
 $ DESCRIPTION           : chr [1:3] "Hard coral" "Soft coral" "Macroalgae"
 $ FUNCTIONAL GROUP      : chr [1:3] "Hard coral" "Soft coral" "Macroalgae"
 $ KEYBOARD SHORTCUT CODE: logi [1:3] NA NA NA
 - attr(*, "spec")=
  .. cols(
  ..   CODE = col_character(),
  ..   DESCRIPTION = col_character(),
  ..   `FUNCTIONAL GROUP` = col_character(),
  ..   `KEYBOARD SHORTCUT CODE` = col_logical()
  .. )
 - attr(*, "problems")=<externalptr> 
# Tabulate a codebook (type, missing values, value frequencies) per column.
knitr::kable(datawizard::data_codebook(labelset))
Warning: Following 1 columns were empty and have been removed:
  KEYBOARD SHORTCUT CODE
ID Name Type Missings Values N Prop .row_id
1 CODE character 0 (0.0%) HCC 1 33.3% 1
MA 1 33.3% 1
SC 1 33.3% 1
1
2 DESCRIPTION character 0 (0.0%) Hard coral 1 33.3% 2
Macroalgae 1 33.3% 2
Soft coral 1 33.3% 2
2
3 FUNCTIONAL GROUP character 0 (0.0%) Hard coral 1 33.3% 3
Macroalgae 1 33.3% 3
Soft coral 1 33.3% 3
3
# Attach the functional group (as GROUP) to each point by matching its
# classification against the labelset CODE.
data_fixed <- dplyr::left_join(
  data_fixed,
  dplyr::select(labelset, CODE, GROUP = `FUNCTIONAL GROUP`),
  by = dplyr::join_by(classification == CODE)
)
head(as.data.frame(data_fixed))
  site_id      site_name site_latitude site_longitude   survey_start_date
1       1 Reef118 Site 1     -20.27968       3.959314 2010-01-01 04:00:00
2       1 Reef118 Site 1     -20.27968       3.959314 2010-01-01 04:00:00
3       1 Reef118 Site 1     -20.27968       3.959314 2010-01-01 04:00:00
4       1 Reef118 Site 1     -20.27968       3.959314 2010-01-01 04:00:00
5       1 Reef118 Site 1     -20.27968       3.959314 2010-01-01 04:00:00
6       1 Reef118 Site 1     -20.27968       3.959314 2010-01-01 04:00:00
  survey_depth survey_transect_number image_id image_quality point_id point_no
1            3                      1        1           100        1        1
2            3                      1        1           100        2        2
3            3                      1        1           100        3        3
4            3                      1        1           100        4        4
5            3                      1        1           100        5        5
6            3                      1       13           100       61        1
                          type classification      GROUP
1 point_machine_classification             MA Macroalgae
2 point_machine_classification             MA Macroalgae
3 point_machine_classification            HCC Hard coral
4 point_machine_classification             MA Macroalgae
5 point_machine_classification             MA Macroalgae
6 point_machine_classification             MA Macroalgae
# Attach the functional group (as GROUP) to each point by matching its
# classification against the labelset CODE.
data_random <- dplyr::left_join(
  data_random,
  dplyr::select(labelset, CODE, GROUP = `FUNCTIONAL GROUP`),
  by = dplyr::join_by(classification == CODE)
)
head(as.data.frame(data_random))
  site_id    site_name site_latitude site_longitude   survey_start_date
1       1 Reef1 Site 1     -19.90968       9.819314 2016-01-01 04:00:00
2       1 Reef1 Site 1     -19.90968       9.819314 2016-01-01 04:00:00
3       1 Reef1 Site 1     -19.90968       9.819314 2016-01-01 04:00:00
4       1 Reef1 Site 1     -19.90968       9.819314 2016-01-01 04:00:00
5       1 Reef1 Site 1     -19.90968       9.819314 2016-01-01 04:00:00
6       1 Reef1 Site 1     -19.90968       9.819314 2016-01-01 04:00:00
  survey_depth survey_transect_number image_id image_quality point_id point_no
1            3                      1        1           100        1        1
2            3                      1        1           100        2        2
3            3                      1        1           100        3        3
4            3                      1        1           100        4        4
5            3                      1        1           100        5        5
6            3                      1       13           100       61        1
                          type classification      GROUP
1 point_machine_classification             SC Soft coral
2 point_machine_classification            HCC Hard coral
3 point_machine_classification             MA Macroalgae
4 point_machine_classification            HCC Hard coral
5 point_machine_classification            HCC Hard coral
6 point_machine_classification            HCC Hard coral

8 Tally up points

Count the number of points of each type as well as sum up the total number of points per image.

# Count the points of each functional group within every image (COUNT), then
# add the number of points in the whole image (TOTAL).
data_fixed <- data_fixed |>
  group_by(across(c(
    starts_with("site"),
    starts_with("survey"),
    type,
    image_id,
    GROUP
  ))) |>
  # GROUP is the last grouping variable, so "drop_last" leaves the counts
  # grouped by everything except GROUP, ready for the per-image total
  summarise(COUNT = n(), .groups = "drop_last") |>
  mutate(TOTAL = sum(COUNT)) |>
  ungroup()
data_random <- data_random |>
  group_by(across(c(
    starts_with("site"),
    starts_with("survey"),
    type,
    image_id,
    GROUP
  ))) |>
  summarise(COUNT = n(), .groups = "drop_last") |>
  mutate(TOTAL = sum(COUNT)) |>
  ungroup()

9 Recode transects

# Build a transect identifier from site, depth and transect number.
# The components are separated by "_" so that distinct combinations can never
# collapse to the same id (without a separator, site 1 / depth 12 / transect 3
# and site 11 / depth 2 / transect 3 would both become "1123").
data_fixed <-
  data_fixed |>
  mutate(
    transect_id = paste(
      site_id, survey_depth, survey_transect_number,
      sep = "_"
    )
  )
data_random <-
  data_random |>
  mutate(
    transect_id = paste(
      site_id, survey_depth, survey_transect_number,
      sep = "_"
    )
  )

10 Fill in any gaps

Since the data represent the classification of points in images, they only record the groups that were present, not those that were absent. For example, if all five points in an image are Macroalgae, then all other functional groups are absent from that image - yet this information is lacking in the data. For modelling purposes it is vital that we fill in all the zero values.

To do so, we must create a data set that contains every GROUP in every IMAGE.

# All functional groups present anywhere in the fixed-design data
GROUPS <- unique(dplyr::pull(data_fixed, GROUP))

# Template: every image (identifying fields plus its point TOTAL) crossed
# with every functional group
data.filler <- data_fixed |>
  dplyr::select(
    starts_with("site"), survey_start_date, survey_depth,
    transect_id, image_id, type, TOTAL
  ) |>
  dplyr::distinct() |>
  tidyr::crossing(GROUP = GROUPS)

# The join matches on all shared columns (no `by` is given), which adds a row
# with a missing COUNT for each image/group combination absent from the
# tallies; those missing counts are then set to 0.
# The result is deliberately left grouped.
data_fixed <- dplyr::full_join(data_fixed, data.filler) |>
  group_by(across(c(
    starts_with("site"), survey_start_date, survey_depth,
    transect_id, image_id, type, GROUP
  ))) |>
  mutate(
    COUNT = ifelse(is.na(COUNT), 0, COUNT),
    TOTAL = max(TOTAL, na.rm = TRUE)
  )
Joining with `by = join_by(site_id, site_name, site_latitude, site_longitude,
survey_start_date, survey_depth, type, image_id, GROUP, TOTAL, transect_id)`
# All functional groups present anywhere in the random-design data
GROUPS <- unique(dplyr::pull(data_random, GROUP))

# Template: every image (identifying fields plus its point TOTAL) crossed
# with every functional group
data.filler <- data_random |>
  dplyr::select(
    starts_with("site"), survey_start_date, survey_depth,
    transect_id, image_id, type, TOTAL
  ) |>
  dplyr::distinct() |>
  tidyr::crossing(GROUP = GROUPS)

# The join matches on all shared columns (no `by` is given), which adds a row
# with a missing COUNT for each image/group combination absent from the
# tallies; those missing counts are then set to 0.
# The result is deliberately left grouped.
data_random <- dplyr::full_join(data_random, data.filler) |>
  group_by(across(c(
    starts_with("site"), survey_start_date, survey_depth,
    transect_id, image_id, type, GROUP
  ))) |>
  mutate(
    COUNT = ifelse(is.na(COUNT), 0, COUNT),
    TOTAL = max(TOTAL, na.rm = TRUE)
  )
Joining with `by = join_by(site_id, site_name, site_latitude, site_longitude,
survey_start_date, survey_depth, type, image_id, GROUP, TOTAL, transect_id)`

11 Sum to transect level

# Collapse the image-level rows to one row per transect, type and GROUP.
# data_fixed is still grouped from the gap-filling step (site fields,
# survey_start_date, survey_depth, transect_id, image_id, type, GROUP);
# removing image_id from that grouping makes the sums run across images.
data_fixed <- 
  data_fixed |>
  ungroup(image_id) |>
  # COUNT: points of the group on the transect; TOTAL: all points on it
  summarise(COUNT = sum(COUNT),
    TOTAL = sum(TOTAL)
  ) |> 
  # summarise() only peels off the last grouping variable, so clear the rest
  ungroup() |> 
  droplevels()
`summarise()` has grouped output by 'site_id', 'site_name', 'site_latitude',
'site_longitude', 'survey_start_date', 'survey_depth', 'transect_id', 'type'.
You can override using the `.groups` argument.
# Collapse the image-level rows to one row per transect, type and GROUP.
# data_random is still grouped from the gap-filling step (site fields,
# survey_start_date, survey_depth, transect_id, image_id, type, GROUP);
# removing image_id from that grouping makes the sums run across images.
data_random <- 
  data_random |>
  ungroup(image_id) |>
  # COUNT: points of the group on the transect; TOTAL: all points on it
  summarise(COUNT = sum(COUNT),
    TOTAL = sum(TOTAL)
  ) |> 
  # summarise() only peels off the last grouping variable, so clear the rest
  ungroup() |> 
  droplevels()
`summarise()` has grouped output by 'site_id', 'site_name', 'site_latitude',
'site_longitude', 'survey_start_date', 'survey_depth', 'transect_id', 'type'.
You can override using the `.groups` argument.

12 Generate a year field

# Year is the calendar year of the survey. TropYear is the year of the survey
# date shifted forward by three months, so surveys from October to December
# are assigned to the following year.
data_fixed <-
  data_fixed |>
  mutate(
    Year = lubridate::year(survey_start_date),
    # add_with_rollback() (the function behind `%m+%`) falls back to the last
    # valid day of the month; a plain `+ months(3)` returns NA when the
    # target day does not exist (e.g. 30 Nov + 3 months = "30 Feb")
    TropYear = lubridate::year(
      lubridate::add_with_rollback(survey_start_date, months(3))
    )
  )
data_random <-
  data_random |>
  mutate(
    Year = lubridate::year(survey_start_date),
    # see above: rollback addition avoids NA for non-existent target dates
    TropYear = lubridate::year(
      lubridate::add_with_rollback(survey_start_date, months(3))
    )
  )

13 Generate a reef id

# Reef_id is the part of site_name that precedes " Site" (for example,
# "Reef118 Site 1" becomes "Reef118").
data_fixed <- dplyr::mutate(
  data_fixed,
  Reef_id = str_replace(site_name, pattern = "(.*) Site.*", replacement = "\\1")
)
data_random <- dplyr::mutate(
  data_random,
  Reef_id = str_replace(site_name, pattern = "(.*) Site.*", replacement = "\\1")
)

14 Visualisations

# Hard coral cover (COUNT / TOTAL) through time for every transect of the
# fixed design, coloured by survey depth, with one panel per site.
data_fixed |>
  filter(GROUP == "Hard coral", type == "point_machine_classification") |>
  ggplot(
    aes(
      x = survey_start_date,
      y = COUNT / TOTAL,
      colour = factor(survey_depth)
    )
  ) +
  geom_point() +
  geom_line(aes(group = transect_id)) +
  facet_wrap(~Reef_id + site_name)

# Hard coral cover (COUNT / TOTAL) through time for the random design,
# coloured by survey depth. The line and facet layers are left disabled.
data_random |>
  filter(GROUP == "Hard coral", type == "point_machine_classification") |>
  ggplot(
    aes(
      x = survey_start_date,
      y = COUNT / TOTAL,
      colour = factor(survey_depth)
    )
  ) +
  geom_point()

  ## geom_line(aes(group = transect_id)) + 
  ## facet_wrap(~Reef_id + site_name)

The end