Preparation of reefCloud data

Author

Murray Logan

Published

March 31, 2024

1 Synopsis

In the previous tutorial, we created synthetic reefCloud data. In the current tutorial, we will prepare these data for statistical analyses. Recall that we created two data sets, one representing a fixed sampling design, the other representing a random sampling design. We will prepare both of these data sets.

Necessary wrangling (preparation) steps:

exclude extraneous (unneeded) fields
exclude poor images
lengthen the data with respect to classification type
join to a labelset lookup
tally up the points per date/image/GROUP/type
recode transect id
fill in the gaps and add the zeros
sum to transect level
generate a Year field from the sample date

2 Preparations

We will start by loading the required R packages.

library(knitr)
library(tidyverse)
library(easystats)
library(sf)

This tutorial will prepare the data sets generated at the end of the previous tutorial. These datasets (reef_data_synthetic_fixed.csv and reef_data_synthetic_random.csv) are reasonably large (they take up substantial disk space in a repository). In order to keep the repository containing these tutorials to a manageable size, rather than track the final, large data sets, I have instead versioned the much smaller penultimate data sets (data_fixed_locs_obs.RData and data_random_locs_obs.RData). Hence, before starting this tutorial, we will first read in these penultimate data sets and repeat the final preparation and writing steps of the last tutorial. If you are working through these tutorials in sequential order and have already created the final data sets, there is no need to repeat this step.

View code

## Design constants. Number_of_frames_per_transect and Points_per_frame
## control how cover is split into frames and points further down; the
## remaining values are not referenced in the chunks shown here.
Number_of_transects_per_site <- 5
Depths <- 2
Number_of_frames_per_transect <- 100
Points_per_frame <- 5

## Variance components, all expressed on the link scale.
## For each prefix (hcc_, sc_, ma_) there are three terms:
##   *_site_sigma      variability in Sites within Locations
##   *_transect_sigma  variability in Transects within Sites
##   *_sigma           random noise

## hcc
hcc_site_sigma <- 0.5
hcc_transect_sigma <- 0.2
hcc_sigma <- 0.1

## sc
sc_site_sigma <- 0.05
sc_transect_sigma <- 0.02
sc_sigma <- 0.01

## ma
ma_site_sigma <- 0.5
ma_transect_sigma <- 0.2
ma_sigma <- 0.1

Fixed design

## Split the relative cover (Value) of each Reef/Site/Transect/Year/Depth/Date
## group into individual point records, then allocate those points to frames.
load(file = "../data/data_fixed_locs_obs.RData")
data_fixed_locs_obs <- data_fixed_locs_obs |>
  group_by(Reef, Site, Transect, Year, Depth, Date) |>
  mutate(
    ## Number of points owed to each row: its share of the group's total
    ## Value, scaled to frames x points-per-frame. Because each row is
    ## rounded independently, a group's total can differ slightly from
    ## Number_of_frames_per_transect * Points_per_frame.
    Points = round(Number_of_frames_per_transect *
      Points_per_frame * (Value / sum(Value)), 0),
    ## Clamp any negative count to zero
    Points = pmax(Points, 0)
  ) |>
  ## One row per point
  tidyr::uncount(Points) |>
  ## Shuffle the rows within each group
  sample_n(n(), replace = FALSE) |>
  mutate(
    ## Point number cycles 1..Points_per_frame down the shuffled rows
    POINT_NO = rep_len(seq_len(Points_per_frame), length.out = n()),
    ## Consecutive runs of Points_per_frame rows share a frame number;
    ## length.out trims or recycles the sequence to the group size
    FRAME = rep(seq_len(Number_of_frames_per_transect),
      each = Points_per_frame, length.out = n()
    )
  ) |>
  ungroup()

## a |> group_by(Reef, Site, Transect, Year, Depth, Group) |>
##     summarise(Count = n()) |>
##     ungroup(Group) |>
##     mutate(Total=sum(Count),
##            Cover = Count/Total)

## Recast the fixed-design point records into the reefCloud export layout:
## derive the project, site, survey, image and point fields in turn, then
## keep only those fields, in export order.
reef_data_synthetic_fixed <- data_fixed_locs_obs |>
  ## Project constants plus readable site / transect labels
  mutate(
    project_id = 1,
    project_name = "synthetic_fixed",
    SITE_NO = str_replace(Site, "^S", "Site "),
    TRANSECT_NO = str_replace(Transect, "^T", "Transect ")
  ) |>
  ## Site fields; site_id is the numeric code of the site_name factor
  mutate(
    site_name = factor(paste(Reef, SITE_NO)),
    site_id = as.numeric(site_name),
    site_latitude = Latitude,
    site_longitude = Longitude,
    site_depth = Depth,
    site_country = "synthetic Country",
    site_reef_name = factor(Reef),
    ## Placeholders: these fields are left empty in the synthetic data
    site_reef_type = NA,
    site_reef_zone = NA,
    site_code = NA,
    site_management = NA
  ) |>
  ## Survey fields; survey_id is the numeric code of the survey_title factor.
  ## NOTE(review): survey_title is built without Depth, so rows that differ
  ## only in Depth appear to share a survey_id (and the image/point ids
  ## derived from it) -- confirm this is intended.
  mutate(
    survey_title = factor(
      paste(Reef, SITE_NO, TRANSECT_NO, format(Date, "%Y-%m-%d"))
    ),
    survey_id = as.numeric(survey_title),
    survey_start_date = Date,
    survey_depth = Depth,
    survey_transect_number = as.numeric(
      str_replace(TRANSECT_NO, "Transect ", "")
    )
  ) |>
  ## Image and point fields
  mutate(
    image_name = factor(paste(survey_title, FRAME)),
    image_id = as.numeric(image_name),
    image_quality = 100,
    point_no = POINT_NO,
    point_id = as.numeric(factor(paste(image_name, POINT_NO))),
    point_machine_classification = Group
  ) |>
  dplyr::select(
    project_id, project_name,
    site_id, site_name, site_latitude, site_longitude, site_depth,
    site_country, site_reef_name, site_reef_type, site_reef_zone,
    site_code, site_management,
    survey_id, survey_title, survey_start_date, survey_depth,
    survey_transect_number,
    image_id, image_name, image_quality,
    point_id, point_no, point_machine_classification
  )
  ##   PCODE = "SYNTHETIC-fixed",
  ##   ID = 1:n(),
  ##   CRUISE_CODE = paste0("SYNTHETIC",Year),
  ##   REEF_NAME = Reef,
  ##   AIMS_REEF_NAME = Reef,
  ##   SECTOR = "synthetic",
  ##   LATITUDE = Latitude,
  ##   LONGITUDE = Longitude,
  ##   SITE_NO = Site,
  ##   TRANSECT_NO = Transect,
  ##   SITE_DEPTH = Depth,
  ##   REEF_ZONE = "-",
  ##   REPORT_YEAR = Year,
  ##   SURVEY_DATE = Date,
  ##   FRAME = paste0(PCODE, "/", REEF_NAME, "/",
  ##     REEF_ZONE, "/", SITE_NO, "/", SITE_DEPTH,
  ##     "/", TRANSECT_NO, "/", REPORT_YEAR, "/", FRAME),
  ##   POINT_NO = POINT_NO,
  ##   FAMILY = NA,
  ##   GROUP_DESC = Group,
  ##   REEFPAGE_CATEGORY = paste0(Group,"_alt")
  ## ) |>
  ## dplyr::select(PCODE, ID, CRUISE_CODE, REEF_NAME,
  ##   AIMS_REEF_NAME, SECTOR,
  ##   LATITUDE, LONGITUDE, SITE_NO, TRANSECT_NO, SITE_DEPTH,
  ##   REEF_ZONE, REPORT_YEAR, SURVEY_DATE, FRAME, POINT_NO,
  ##   FAMILY, GROUP_DESC, REEFPAGE_CATEGORY)

## Write the fixed-design data to CSV, then show its first rows as a
## paged table
reef_data_synthetic_fixed |>
  write_csv(file = "../data/reef_data_synthetic_fixed.csv")
rmarkdown::paged_table(head(reef_data_synthetic_fixed))

Random design

## Split the relative cover (Value) of each Reef/Site/Transect/Year/Depth/Date
## group into individual point records, then allocate those points to frames.
load(file = "../data/data_random_locs_obs.RData")
data_random_locs_obs <- data_random_locs_obs |>
  group_by(Reef, Site, Transect, Year, Depth, Date) |>
  mutate(
    ## Number of points owed to each row: its share of the group's total
    ## Value, scaled to frames x points-per-frame. Because each row is
    ## rounded independently, a group's total can differ slightly from
    ## Number_of_frames_per_transect * Points_per_frame.
    Points = round(Number_of_frames_per_transect *
      Points_per_frame * (Value / sum(Value)), 0),
    ## Clamp any negative count to zero
    Points = pmax(Points, 0)
  ) |>
  ## One row per point
  tidyr::uncount(Points) |>
  ## Shuffle the rows within each group
  sample_n(n(), replace = FALSE) |>
  mutate(
    ## Point number cycles 1..Points_per_frame down the shuffled rows
    POINT_NO = rep_len(seq_len(Points_per_frame), length.out = n()),
    ## Consecutive runs of Points_per_frame rows share a frame number;
    ## length.out trims or recycles the sequence to the group size
    FRAME = rep(seq_len(Number_of_frames_per_transect),
      each = Points_per_frame, length.out = n()
    )
  ) |>
  ungroup()

## Recast the random-design point records into the reefCloud export layout:
## derive the project, site, survey, image and point fields, then keep only
## those fields, in export order.
reef_data_synthetic_random <-
  data_random_locs_obs |>
  mutate(
    project_id = 1,
    ## Label the export as the random design (this previously read
    ## "synthetic_fixed", carried over from the fixed-design chunk)
    project_name = "synthetic_random",
    ## Readable site / transect labels, e.g. "S1" -> "Site 1"
    SITE_NO = str_replace(Site, "^S", "Site "),
    TRANSECT_NO = str_replace(Transect, "^T", "Transect "),
    ## Numeric ids are the integer codes of the corresponding factors
    site_name = factor(paste(Reef, SITE_NO)),
    site_id = as.numeric(site_name),
    site_latitude = Latitude,
    site_longitude = Longitude,
    site_depth = Depth,
    site_country = "synthetic Country",
    site_reef_name = factor(Reef),
    ## Placeholders: these fields are left empty in the synthetic data
    site_reef_type = NA,
    site_reef_zone = NA,
    site_code = NA,
    site_management = NA,
    ## NOTE(review): survey_title is built without Depth, so rows that
    ## differ only in Depth appear to share a survey_id (and the image/point
    ## ids derived from it) -- confirm this is intended.
    survey_title = factor(
      paste(Reef, SITE_NO, TRANSECT_NO, format(Date, "%Y-%m-%d"))
    ),
    survey_id = as.numeric(survey_title),
    survey_start_date = Date,
    survey_depth = Depth,
    survey_transect_number = as.numeric(
      str_replace(TRANSECT_NO, "Transect ", "")
    ),
    image_name = factor(paste(survey_title, FRAME)),
    image_id = as.numeric(image_name),
    image_quality = 100,
    point_no = POINT_NO,
    point_id = as.numeric(factor(paste(image_name, POINT_NO))),
    point_machine_classification = Group
  ) |>
  dplyr::select(
    project_id,
    project_name,
    site_id,
    site_name,
    site_latitude,
    site_longitude,
    site_depth,
    site_country,
    site_reef_name,
    site_reef_type,
    site_reef_zone,
    site_code,
    site_management,
    survey_id,
    survey_title,
    survey_start_date,
    survey_depth,
    survey_transect_number,
    image_id,
    image_name,
    image_quality,
    point_id,
    point_no,
    point_machine_classification
  )
  ##   PCODE = "SYNTHETIC-random",
  ##   ID = 1:n(),
  ##   CRUISE_CODE = paste0("SYNTHETIC", Year),
  ##   REEF_NAME = Reef,
  ##   AIMS_REEF_NAME = Reef,
  ##   SECTOR = "synthetic",
  ##   LATITUDE = Latitude,
  ##   LONGITUDE = Longitude,
  ##   SITE_NO = Site,
  ##   TRANSECT_NO = Transect,
  ##   SITE_DEPTH = Depth,
  ##   REEF_ZONE = "-",
  ##   REPORT_YEAR = Year,
  ##   SURVEY_DATE = Date,
  ##   FRAME = paste0(PCODE, "/", REEF_NAME, "/", REEF_ZONE,
  ##     "/", SITE_NO, "/", SITE_DEPTH, "/", TRANSECT_NO,
  ##     "/", REPORT_YEAR, "/", FRAME),
  ##   POINT_NO = POINT_NO,
  ##   FAMILY = NA,
  ##   GROUP_DESC = Group,
  ##   REEFPAGE_CATEGORY = paste0(Group, "_alt")
  ## ) |>
  ## dplyr::select(
  ##   PCODE, ID, CRUISE_CODE, REEF_NAME, AIMS_REEF_NAME, SECTOR,
  ##   LATITUDE, LONGITUDE, SITE_NO, TRANSECT_NO, SITE_DEPTH,
  ##   REEF_ZONE, REPORT_YEAR, SURVEY_DATE, FRAME, POINT_NO,
  ##   FAMILY, GROUP_DESC, REEFPAGE_CATEGORY
  ## )

## Write the random-design data to CSV, then show its first rows as a
## paged table
reef_data_synthetic_random |>
  write_csv(file = "../data/reef_data_synthetic_random.csv")
rmarkdown::paged_table(head(reef_data_synthetic_random))

3 Read in the data

Let's start by reading in the data sets (which were exported as csv files). There are many functions in R that can read in a CSV file. We will use the read_csv() function as it is part of the tidyverse ecosystem.

Fixed design
Random design

## Import the fixed-design CSV, trimming surrounding whitespace from fields
data_fixed <- read_csv(
  file = "../data/reef_data_synthetic_fixed.csv",
  trim_ws = TRUE
)

Rows: 3000027 Columns: 24
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr   (7): project_name, site_name, site_country, site_reef_name, survey_tit...
dbl  (12): project_id, site_id, site_latitude, site_longitude, site_depth, s...
lgl   (4): site_reef_type, site_reef_zone, site_code, site_management
dttm  (1): survey_start_date

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

After reading in a dataset, it is always a good idea to quickly explore a few summaries in order to ascertain whether the imported data are correctly transcribed. In particular, we should pay attention to whether there are any unexpected missing values and ensure that each variable (column) has the expected class (e.g. that variables we expected to be considered numbers are indeed listed as either <dbl> or <int> and not <chr>).

## Compact overview: one line per column with its type and first values
glimpse(data_fixed)

Rows: 3,000,027
Columns: 24
$ project_id                   <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ project_name                 <chr> "synthetic_fixed", "synthetic_fixed", "sy…
$ site_id                      <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ site_name                    <chr> "Reef118 Site 1", "Reef118 Site 1", "Reef…
$ site_latitude                <dbl> -20.27968, -20.27968, -20.27968, -20.2796…
$ site_longitude               <dbl> 3.959314, 3.959314, 3.959314, 3.959314, 3…
$ site_depth                   <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,…
$ site_country                 <chr> "synthetic Country", "synthetic Country",…
$ site_reef_name               <chr> "Reef118", "Reef118", "Reef118", "Reef118…
$ site_reef_type               <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ site_reef_zone               <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ site_code                    <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ site_management              <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ survey_id                    <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ survey_title                 <chr> "Reef118 Site 1 Transect 1 2010-01-01", "…
$ survey_start_date            <dttm> 2010-01-01 04:00:00, 2010-01-01 04:00:00…
$ survey_depth                 <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,…
$ survey_transect_number       <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ image_id                     <dbl> 1, 1, 1, 1, 1, 13, 13, 13, 13, 13, 24, 24…
$ image_name                   <chr> "Reef118 Site 1 Transect 1 2010-01-01 1",…
$ image_quality                <dbl> 100, 100, 100, 100, 100, 100, 100, 100, 1…
$ point_id                     <dbl> 1, 2, 3, 4, 5, 61, 62, 63, 64, 65, 116, 1…
$ point_no                     <dbl> 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4,…
$ point_machine_classification <chr> "MA", "MA", "HCC", "MA", "MA", "MA", "MA"…

## First six rows
head(data_fixed)

# A tibble: 6 × 24
  project_id project_name    site_id site_name      site_latitude site_longitude
       <dbl> <chr>             <dbl> <chr>                  <dbl>          <dbl>
1          1 synthetic_fixed       1 Reef118 Site 1         -20.3           3.96
2          1 synthetic_fixed       1 Reef118 Site 1         -20.3           3.96
3          1 synthetic_fixed       1 Reef118 Site 1         -20.3           3.96
4          1 synthetic_fixed       1 Reef118 Site 1         -20.3           3.96
5          1 synthetic_fixed       1 Reef118 Site 1         -20.3           3.96
6          1 synthetic_fixed       1 Reef118 Site 1         -20.3           3.96
# ℹ 18 more variables: site_depth <dbl>, site_country <chr>,
#   site_reef_name <chr>, site_reef_type <lgl>, site_reef_zone <lgl>,
#   site_code <lgl>, site_management <lgl>, survey_id <dbl>,
#   survey_title <chr>, survey_start_date <dttm>, survey_depth <dbl>,
#   survey_transect_number <dbl>, image_id <dbl>, image_name <chr>,
#   image_quality <dbl>, point_id <dbl>, point_no <dbl>,
#   point_machine_classification <chr>

## Internal structure, including the column specification attribute
str(data_fixed)

spc_tbl_ [3,000,027 × 24] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
 $ project_id                  : num [1:3000027] 1 1 1 1 1 1 1 1 1 1 ...
 $ project_name                : chr [1:3000027] "synthetic_fixed" "synthetic_fixed" "synthetic_fixed" "synthetic_fixed" ...
 $ site_id                     : num [1:3000027] 1 1 1 1 1 1 1 1 1 1 ...
 $ site_name                   : chr [1:3000027] "Reef118 Site 1" "Reef118 Site 1" "Reef118 Site 1" "Reef118 Site 1" ...
 $ site_latitude               : num [1:3000027] -20.3 -20.3 -20.3 -20.3 -20.3 ...
 $ site_longitude              : num [1:3000027] 3.96 3.96 3.96 3.96 3.96 ...
 $ site_depth                  : num [1:3000027] 3 3 3 3 3 3 3 3 3 3 ...
 $ site_country                : chr [1:3000027] "synthetic Country" "synthetic Country" "synthetic Country" "synthetic Country" ...
 $ site_reef_name              : chr [1:3000027] "Reef118" "Reef118" "Reef118" "Reef118" ...
 $ site_reef_type              : logi [1:3000027] NA NA NA NA NA NA ...
 $ site_reef_zone              : logi [1:3000027] NA NA NA NA NA NA ...
 $ site_code                   : logi [1:3000027] NA NA NA NA NA NA ...
 $ site_management             : logi [1:3000027] NA NA NA NA NA NA ...
 $ survey_id                   : num [1:3000027] 1 1 1 1 1 1 1 1 1 1 ...
 $ survey_title                : chr [1:3000027] "Reef118 Site 1 Transect 1 2010-01-01" "Reef118 Site 1 Transect 1 2010-01-01" "Reef118 Site 1 Transect 1 2010-01-01" "Reef118 Site 1 Transect 1 2010-01-01" ...
 $ survey_start_date           : POSIXct[1:3000027], format: "2010-01-01 04:00:00" "2010-01-01 04:00:00" ...
 $ survey_depth                : num [1:3000027] 3 3 3 3 3 3 3 3 3 3 ...
 $ survey_transect_number      : num [1:3000027] 1 1 1 1 1 1 1 1 1 1 ...
 $ image_id                    : num [1:3000027] 1 1 1 1 1 13 13 13 13 13 ...
 $ image_name                  : chr [1:3000027] "Reef118 Site 1 Transect 1 2010-01-01 1" "Reef118 Site 1 Transect 1 2010-01-01 1" "Reef118 Site 1 Transect 1 2010-01-01 1" "Reef118 Site 1 Transect 1 2010-01-01 1" ...
 $ image_quality               : num [1:3000027] 100 100 100 100 100 100 100 100 100 100 ...
 $ point_id                    : num [1:3000027] 1 2 3 4 5 61 62 63 64 65 ...
 $ point_no                    : num [1:3000027] 1 2 3 4 5 1 2 3 4 5 ...
 $ point_machine_classification: chr [1:3000027] "MA" "MA" "HCC" "MA" ...
 - attr(*, "spec")=
  .. cols(
  ..   project_id = col_double(),
  ..   project_name = col_character(),
  ..   site_id = col_double(),
  ..   site_name = col_character(),
  ..   site_latitude = col_double(),
  ..   site_longitude = col_double(),
  ..   site_depth = col_double(),
  ..   site_country = col_character(),
  ..   site_reef_name = col_character(),
  ..   site_reef_type = col_logical(),
  ..   site_reef_zone = col_logical(),
  ..   site_code = col_logical(),
  ..   site_management = col_logical(),
  ..   survey_id = col_double(),
  ..   survey_title = col_character(),
  ..   survey_start_date = col_datetime(format = ""),
  ..   survey_depth = col_double(),
  ..   survey_transect_number = col_double(),
  ..   image_id = col_double(),
  ..   image_name = col_character(),
  ..   image_quality = col_double(),
  ..   point_id = col_double(),
  ..   point_no = col_double(),
  ..   point_machine_classification = col_character()
  .. )
 - attr(*, "problems")=<externalptr>

## Build a codebook (type, missings, values and frequencies per column) and
## render it as a table
knitr::kable(datawizard::data_codebook(data_fixed))

Warning: Following 4 columns were empty and have been removed:
  site_reef_type, site_reef_zone, site_code and site_management

ID	Name	Type	Missings	Values	N	Prop	.row_id
1	project_id	numeric	0 (0.0%)	1	3000027	100.0%	1
							1
2	project_name	character	0 (0.0%)	synthetic_fixed	3000027	100.0%	2
							2
3	site_id	numeric	0 (0.0%)	[1, 50]	3000027		3
							3
4	site_name	character	0 (0.0%)	Reef118 Site 1	59992	2.0%	4
				Reef118 Site 2	59999	2.0%	4
				Reef137 Site 1	60000	2.0%	4
				Reef137 Site 2	59998	2.0%	4
				Reef14 Site 1	60006	2.0%	4
				Reef14 Site 2	60002	2.0%	4
				Reef153 Site 1	60013	2.0%	4
				Reef153 Site 2	60007	2.0%	4
				Reef159 Site 1	59999	2.0%	4
				Reef159 Site 2	60004	2.0%	4
				(…)	NA	NA	4
							4
5	site_latitude	numeric	0 (0.0%)	[-20.69, -10.32]	3000027		5
							5
6	site_longitude	numeric	0 (0.0%)	[0.86, 9.04]	3000027		6
							6
7	site_depth	numeric	0 (0.0%)	3	1500009	50.0%	7
				10	1500018	50.0%	7
							7
8	site_country	character	0 (0.0%)	synthetic Country	3000027	100.0%	8
							8
9	site_reef_name	character	0 (0.0%)	Reef118	119991	4.0%	9
				Reef137	119998	4.0%	9
				Reef14	120008	4.0%	9
				Reef153	120020	4.0%	9
				Reef159	120003	4.0%	9
				Reef170	119996	4.0%	9
				Reef179	120003	4.0%	9
				Reef185	119993	4.0%	9
				Reef195	119995	4.0%	9
				Reef197	119997	4.0%	9
				(…)	NA	NA	9
							9
14	survey_id	numeric	0 (0.0%)	[1, 3000]	3000027		10
							10
15	survey_title	character	0 (0.0%)	Reef118 Site 1 Transect 1 2010-01-01	1000	0.0%	11
				Reef118 Site 1 Transect 1 2011-01-01	1000	0.0%	11
				Reef118 Site 1 Transect 1 2012-01-01	999	0.0%	11
				Reef118 Site 1 Transect 1 2013-01-01	1001	0.0%	11
				Reef118 Site 1 Transect 1 2014-01-01	1000	0.0%	11
				Reef118 Site 1 Transect 1 2015-01-01	1000	0.0%	11
				Reef118 Site 1 Transect 1 2016-01-01	1000	0.0%	11
				Reef118 Site 1 Transect 1 2017-01-01	1000	0.0%	11
				Reef118 Site 1 Transect 1 2018-01-01	1001	0.0%	11
				Reef118 Site 1 Transect 1 2019-01-01	999	0.0%	11
				(…)	NA	NA	11
							11
16	survey_start_date	numeric	0 (0.0%)	2010-01-01 04:00:00	249997	8.3%	12
				2011-01-01 04:00:00	250008	8.3%	12
				2012-01-01 04:00:00	250004	8.3%	12
				2013-01-01 04:00:00	249990	8.3%	12
				2014-01-01 04:00:00	250023	8.3%	12
				2015-01-01 04:00:00	250000	8.3%	12
				2016-01-01 04:00:00	250019	8.3%	12
				2017-01-01 04:00:00	249990	8.3%	12
				2018-01-01 04:00:00	249999	8.3%	12
				2019-01-01 04:00:00	250014	8.3%	12
				(…)	NA	NA	12
							12
17	survey_depth	numeric	0 (0.0%)	3	1500009	50.0%	13
				10	1500018	50.0%	13
							13
18	survey_transect_number	numeric	0 (0.0%)	1	600012	20.0%	14
				2	600003	20.0%	14
				3	600007	20.0%	14
				4	599997	20.0%	14
				5	600008	20.0%	14
							14
19	image_id	numeric	0 (0.0%)	[1, 300000]	3000027		15
							15
20	image_name	character	0 (0.0%)	Reef118 Site 1 Transect 1 2010-01-01 1	10	0.0%	16
				Reef118 Site 1 Transect 1 2010-01-01 10	10	0.0%	16
				Reef118 Site 1 Transect 1 2010-01-01 100	10	0.0%	16
				Reef118 Site 1 Transect 1 2010-01-01 11	10	0.0%	16
				Reef118 Site 1 Transect 1 2010-01-01 12	10	0.0%	16
				Reef118 Site 1 Transect 1 2010-01-01 13	10	0.0%	16
				Reef118 Site 1 Transect 1 2010-01-01 14	10	0.0%	16
				Reef118 Site 1 Transect 1 2010-01-01 15	10	0.0%	16
				Reef118 Site 1 Transect 1 2010-01-01 16	10	0.0%	16
				Reef118 Site 1 Transect 1 2010-01-01 17	10	0.0%	16
				(…)	NA	NA	16
							16
21	image_quality	numeric	0 (0.0%)	100	3000027	100.0%	17
							17
22	point_id	numeric	0 (0.0%)	[1, 1.49996e+06]	3000027		18
							18
23	point_no	numeric	0 (0.0%)	1	600746	20.0%	19
				2	600000	20.0%	19
				3	600000	20.0%	19
				4	600000	20.0%	19
				5	599281	20.0%	19
							19
24	point_machine_classification	character	0 (0.0%)	HCC	986690	32.9%	20
				MA	1844885	61.5%	20
				SC	168452	5.6%	20
							20

## Import the random-design CSV, trimming surrounding whitespace from fields
data_random <- read_csv(
  file = "../data/reef_data_synthetic_random.csv",
  trim_ws = TRUE
)

Rows: 2999943 Columns: 24
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr   (7): project_name, site_name, site_country, site_reef_name, survey_tit...
dbl  (12): project_id, site_id, site_latitude, site_longitude, site_depth, s...
lgl   (4): site_reef_type, site_reef_zone, site_code, site_management
dttm  (1): survey_start_date

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

## Compact overview: one line per column with its type and first values
glimpse(data_random)

Rows: 2,999,943
Columns: 24
$ project_id                   <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ project_name                 <chr> "synthetic_fixed", "synthetic_fixed", "sy…
$ site_id                      <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ site_name                    <chr> "Reef1 Site 1", "Reef1 Site 1", "Reef1 Si…
$ site_latitude                <dbl> -19.90968, -19.90968, -19.90968, -19.9096…
$ site_longitude               <dbl> 9.819314, 9.819314, 9.819314, 9.819314, 9…
$ site_depth                   <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,…
$ site_country                 <chr> "synthetic Country", "synthetic Country",…
$ site_reef_name               <chr> "Reef1", "Reef1", "Reef1", "Reef1", "Reef…
$ site_reef_type               <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ site_reef_zone               <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ site_code                    <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ site_management              <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ survey_id                    <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ survey_title                 <chr> "Reef1 Site 1 Transect 1 2016-01-01", "Re…
$ survey_start_date            <dttm> 2016-01-01 04:00:00, 2016-01-01 04:00:00…
$ survey_depth                 <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,…
$ survey_transect_number       <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ image_id                     <dbl> 1, 1, 1, 1, 1, 13, 13, 13, 13, 13, 24, 24…
$ image_name                   <chr> "Reef1 Site 1 Transect 1 2016-01-01 1", "…
$ image_quality                <dbl> 100, 100, 100, 100, 100, 100, 100, 100, 1…
$ point_id                     <dbl> 1, 2, 3, 4, 5, 61, 62, 63, 64, 65, 116, 1…
$ point_no                     <dbl> 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4,…
$ point_machine_classification <chr> "SC", "HCC", "MA", "HCC", "HCC", "HCC", "…

## First six rows
head(data_random)

# A tibble: 6 × 24
  project_id project_name    site_id site_name    site_latitude site_longitude
       <dbl> <chr>             <dbl> <chr>                <dbl>          <dbl>
1          1 synthetic_fixed       1 Reef1 Site 1         -19.9           9.82
2          1 synthetic_fixed       1 Reef1 Site 1         -19.9           9.82
3          1 synthetic_fixed       1 Reef1 Site 1         -19.9           9.82
4          1 synthetic_fixed       1 Reef1 Site 1         -19.9           9.82
5          1 synthetic_fixed       1 Reef1 Site 1         -19.9           9.82
6          1 synthetic_fixed       1 Reef1 Site 1         -19.9           9.82
# ℹ 18 more variables: site_depth <dbl>, site_country <chr>,
#   site_reef_name <chr>, site_reef_type <lgl>, site_reef_zone <lgl>,
#   site_code <lgl>, site_management <lgl>, survey_id <dbl>,
#   survey_title <chr>, survey_start_date <dttm>, survey_depth <dbl>,
#   survey_transect_number <dbl>, image_id <dbl>, image_name <chr>,
#   image_quality <dbl>, point_id <dbl>, point_no <dbl>,
#   point_machine_classification <chr>

## Internal structure, including the column specification attribute
str(data_random)

spc_tbl_ [2,999,943 × 24] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
 $ project_id                  : num [1:2999943] 1 1 1 1 1 1 1 1 1 1 ...
 $ project_name                : chr [1:2999943] "synthetic_fixed" "synthetic_fixed" "synthetic_fixed" "synthetic_fixed" ...
 $ site_id                     : num [1:2999943] 1 1 1 1 1 1 1 1 1 1 ...
 $ site_name                   : chr [1:2999943] "Reef1 Site 1" "Reef1 Site 1" "Reef1 Site 1" "Reef1 Site 1" ...
 $ site_latitude               : num [1:2999943] -19.9 -19.9 -19.9 -19.9 -19.9 ...
 $ site_longitude              : num [1:2999943] 9.82 9.82 9.82 9.82 9.82 ...
 $ site_depth                  : num [1:2999943] 3 3 3 3 3 3 3 3 3 3 ...
 $ site_country                : chr [1:2999943] "synthetic Country" "synthetic Country" "synthetic Country" "synthetic Country" ...
 $ site_reef_name              : chr [1:2999943] "Reef1" "Reef1" "Reef1" "Reef1" ...
 $ site_reef_type              : logi [1:2999943] NA NA NA NA NA NA ...
 $ site_reef_zone              : logi [1:2999943] NA NA NA NA NA NA ...
 $ site_code                   : logi [1:2999943] NA NA NA NA NA NA ...
 $ site_management             : logi [1:2999943] NA NA NA NA NA NA ...
 $ survey_id                   : num [1:2999943] 1 1 1 1 1 1 1 1 1 1 ...
 $ survey_title                : chr [1:2999943] "Reef1 Site 1 Transect 1 2016-01-01" "Reef1 Site 1 Transect 1 2016-01-01" "Reef1 Site 1 Transect 1 2016-01-01" "Reef1 Site 1 Transect 1 2016-01-01" ...
 $ survey_start_date           : POSIXct[1:2999943], format: "2016-01-01 04:00:00" "2016-01-01 04:00:00" ...
 $ survey_depth                : num [1:2999943] 3 3 3 3 3 3 3 3 3 3 ...
 $ survey_transect_number      : num [1:2999943] 1 1 1 1 1 1 1 1 1 1 ...
 $ image_id                    : num [1:2999943] 1 1 1 1 1 13 13 13 13 13 ...
 $ image_name                  : chr [1:2999943] "Reef1 Site 1 Transect 1 2016-01-01 1" "Reef1 Site 1 Transect 1 2016-01-01 1" "Reef1 Site 1 Transect 1 2016-01-01 1" "Reef1 Site 1 Transect 1 2016-01-01 1" ...
 $ image_quality               : num [1:2999943] 100 100 100 100 100 100 100 100 100 100 ...
 $ point_id                    : num [1:2999943] 1 2 3 4 5 61 62 63 64 65 ...
 $ point_no                    : num [1:2999943] 1 2 3 4 5 1 2 3 4 5 ...
 $ point_machine_classification: chr [1:2999943] "SC" "HCC" "MA" "HCC" ...
 - attr(*, "spec")=
  .. cols(
  ..   project_id = col_double(),
  ..   project_name = col_character(),
  ..   site_id = col_double(),
  ..   site_name = col_character(),
  ..   site_latitude = col_double(),
  ..   site_longitude = col_double(),
  ..   site_depth = col_double(),
  ..   site_country = col_character(),
  ..   site_reef_name = col_character(),
  ..   site_reef_type = col_logical(),
  ..   site_reef_zone = col_logical(),
  ..   site_code = col_logical(),
  ..   site_management = col_logical(),
  ..   survey_id = col_double(),
  ..   survey_title = col_character(),
  ..   survey_start_date = col_datetime(format = ""),
  ..   survey_depth = col_double(),
  ..   survey_transect_number = col_double(),
  ..   image_id = col_double(),
  ..   image_name = col_character(),
  ..   image_quality = col_double(),
  ..   point_id = col_double(),
  ..   point_no = col_double(),
  ..   point_machine_classification = col_character()
  .. )
 - attr(*, "problems")=<externalptr>

## Build a codebook (type, missings, values and frequencies per column) and
## render it as a table
knitr::kable(datawizard::data_codebook(data_random))

Warning: Following 4 columns were empty and have been removed:
  site_reef_type, site_reef_zone, site_code and site_management

ID	Name	Type	Missings	Values	N	Prop	.row_id
1	project_id	numeric	0 (0.0%)	1	2999943	100.0%	1
							1
2	project_name	character	0 (0.0%)	synthetic_fixed	2999943	100.0%	2
							2
3	site_id	numeric	0 (0.0%)	[1, 350]	2999943		3
							3
4	site_name	character	0 (0.0%)	Reef1 Site 1	4998	0.2%	4
				Reef1 Site 2	5000	0.2%	4
				Reef10 Site 1	10000	0.3%	4
				Reef10 Site 2	10001	0.3%	4
				Reef101 Site 1	5000	0.2%	4
				Reef101 Site 2	4999	0.2%	4
				Reef102 Site 1	4999	0.2%	4
				Reef102 Site 2	5002	0.2%	4
				Reef103 Site 1	4999	0.2%	4
				Reef103 Site 2	5000	0.2%	4
				(…)	NA	NA	4
							4
5	site_latitude	numeric	0 (0.0%)	[-20.99, -10.25]	2999943		5
							5
6	site_longitude	numeric	0 (0.0%)	[0.34, 9.82]	2999943		6
							6
7	site_depth	numeric	0 (0.0%)	3	1499995	50.0%	7
				10	1499948	50.0%	7
							7
8	site_country	character	0 (0.0%)	synthetic Country	2999943	100.0%	8
							8
9	site_reef_name	character	0 (0.0%)	Reef1	9998	0.3%	9
				Reef10	20001	0.7%	9
				Reef101	9999	0.3%	9
				Reef102	10001	0.3%	9
				Reef103	9999	0.3%	9
				Reef104	10001	0.3%	9
				Reef105	10002	0.3%	9
				Reef106	20002	0.7%	9
				Reef107	30001	1.0%	9
				Reef108	9999	0.3%	9
				(…)	NA	NA	9
							9
14	survey_id	numeric	0 (0.0%)	[1, 3000]	2999943		10
							10
15	survey_title	character	0 (0.0%)	Reef1 Site 1 Transect 1 2016-01-01	1000	0.0%	11
				Reef1 Site 1 Transect 2 2016-01-01	999	0.0%	11
				Reef1 Site 1 Transect 3 2016-01-01	998	0.0%	11
				Reef1 Site 1 Transect 4 2016-01-01	1001	0.0%	11
				Reef1 Site 1 Transect 5 2016-01-01	1000	0.0%	11
				Reef1 Site 2 Transect 1 2016-01-01	1000	0.0%	11
				Reef1 Site 2 Transect 2 2016-01-01	1000	0.0%	11
				Reef1 Site 2 Transect 3 2016-01-01	1000	0.0%	11
				Reef1 Site 2 Transect 4 2016-01-01	1000	0.0%	11
				Reef1 Site 2 Transect 5 2016-01-01	1000	0.0%	11
				(…)	NA	NA	11
							11
16	survey_start_date	numeric	0 (0.0%)	2010-01-01 04:00:00	250001	8.3%	12
				2011-01-01 04:00:00	249987	8.3%	12
				2012-01-01 04:00:00	249996	8.3%	12
				2013-01-01 04:00:00	249986	8.3%	12
				2014-01-01 04:00:00	249997	8.3%	12
				2015-01-01 04:00:00	250021	8.3%	12
				2016-01-01 04:00:00	249986	8.3%	12
				2017-01-01 04:00:00	250007	8.3%	12
				2018-01-01 04:00:00	249995	8.3%	12
				2019-01-01 04:00:00	250007	8.3%	12
				(…)	NA	NA	12
							12
17	survey_depth	numeric	0 (0.0%)	3	1499995	50.0%	13
				10	1499948	50.0%	13
							13
18	survey_transect_number	numeric	0 (0.0%)	1	599986	20.0%	14
				2	599992	20.0%	14
				3	599980	20.0%	14
				4	599992	20.0%	14
				5	599993	20.0%	14
							14
19	image_id	numeric	0 (0.0%)	[1, 300000]	2999943		15
							15
20	image_name	character	0 (0.0%)	Reef1 Site 1 Transect 1 2016-01-01 1	10	0.0%	16
				Reef1 Site 1 Transect 1 2016-01-01 10	10	0.0%	16
				Reef1 Site 1 Transect 1 2016-01-01 100	10	0.0%	16
				Reef1 Site 1 Transect 1 2016-01-01 11	10	0.0%	16
				Reef1 Site 1 Transect 1 2016-01-01 12	10	0.0%	16
				Reef1 Site 1 Transect 1 2016-01-01 13	10	0.0%	16
				Reef1 Site 1 Transect 1 2016-01-01 14	10	0.0%	16
				Reef1 Site 1 Transect 1 2016-01-01 15	10	0.0%	16
				Reef1 Site 1 Transect 1 2016-01-01 16	10	0.0%	16
				Reef1 Site 1 Transect 1 2016-01-01 17	10	0.0%	16
				(…)	NA	NA	16
							16
21	image_quality	numeric	0 (0.0%)	100	2999943	100.0%	17
							17
22	point_id	numeric	0 (0.0%)	[1, 1.49994e+06]	2999943		18
							18
23	point_no	numeric	0 (0.0%)	1	600714	20.0%	19
				2	600000	20.0%	19
				3	600000	20.0%	19
				4	600000	20.0%	19
				5	599229	20.0%	19
							19
24	point_machine_classification	character	0 (0.0%)	HCC	1046658	34.9%	20
				MA	1776443	59.2%	20
				SC	176842	5.9%	20
							20

4 Excluding extraneous fields

As these are synthetic data, not all the typical reefCloud fields are present. Nevertheless, there are still a large number of fields (columns) in this dataset, many of which we are going to ignore for this exercise. The important fields are:

site_id - a unique identifier of the site
site_name - a unique name of the site
site_latitude - latitude of the site
site_longitude - longitude of the site
survey_start_date - date (and time) of survey
survey_depth - depth at which the survey took place
survey_transect_number - unique identifier of the transect
image_id - unique identifier of the image
image_quality - indication of the quality of the image
point_id - unique identifier of the point
point_no - the number of the point within the image
point_machine_classification - classification determined by AI

Although it is often harmless enough to retain the other fields, doing so makes reviewing the data more cumbersome, so at this early stage of the exercise we will restrict the data to just the above fields.

Fixed design
Random design

# Restrict the fixed-design data to the twelve fields used in the remaining
# preparation steps, then preview the first rows.
data_fixed <- dplyr::select(
  data_fixed,
  dplyr::all_of(c(
    "site_id", "site_name", "site_latitude", "site_longitude",
    "survey_start_date", "survey_depth", "survey_transect_number",
    "image_id", "image_quality",
    "point_id", "point_no", "point_machine_classification"
  ))
)
head(data_fixed)

# A tibble: 6 × 12
  site_id site_name      site_latitude site_longitude survey_start_date  
    <dbl> <chr>                  <dbl>          <dbl> <dttm>             
1       1 Reef118 Site 1         -20.3           3.96 2010-01-01 04:00:00
2       1 Reef118 Site 1         -20.3           3.96 2010-01-01 04:00:00
3       1 Reef118 Site 1         -20.3           3.96 2010-01-01 04:00:00
4       1 Reef118 Site 1         -20.3           3.96 2010-01-01 04:00:00
5       1 Reef118 Site 1         -20.3           3.96 2010-01-01 04:00:00
6       1 Reef118 Site 1         -20.3           3.96 2010-01-01 04:00:00
# ℹ 7 more variables: survey_depth <dbl>, survey_transect_number <dbl>,
#   image_id <dbl>, image_quality <dbl>, point_id <dbl>, point_no <dbl>,
#   point_machine_classification <chr>

# Restrict the random-design data to the twelve fields used in the remaining
# preparation steps, then preview the first rows.
data_random <- dplyr::select(
  data_random,
  dplyr::all_of(c(
    "site_id", "site_name", "site_latitude", "site_longitude",
    "survey_start_date", "survey_depth", "survey_transect_number",
    "image_id", "image_quality",
    "point_id", "point_no", "point_machine_classification"
  ))
)
head(data_random)

# A tibble: 6 × 12
  site_id site_name    site_latitude site_longitude survey_start_date  
    <dbl> <chr>                <dbl>          <dbl> <dttm>             
1       1 Reef1 Site 1         -19.9           9.82 2016-01-01 04:00:00
2       1 Reef1 Site 1         -19.9           9.82 2016-01-01 04:00:00
3       1 Reef1 Site 1         -19.9           9.82 2016-01-01 04:00:00
4       1 Reef1 Site 1         -19.9           9.82 2016-01-01 04:00:00
5       1 Reef1 Site 1         -19.9           9.82 2016-01-01 04:00:00
6       1 Reef1 Site 1         -19.9           9.82 2016-01-01 04:00:00
# ℹ 7 more variables: survey_depth <dbl>, survey_transect_number <dbl>,
#   image_id <dbl>, image_quality <dbl>, point_id <dbl>, point_no <dbl>,
#   point_machine_classification <chr>

5 Excluding poor images

Fixed design
Random design

# Drop images whose quality score is exactly 0; images with a missing
# quality score are retained.
data_fixed <- dplyr::filter(data_fixed, !(image_quality %in% 0))

# Drop images whose quality score is exactly 0; images with a missing
# quality score are retained.
data_random <- dplyr::filter(data_random, !(image_quality %in% 0))

6 Lengthen the data

To facilitate most graphical and statistical modelling routines, data must be structured such that each row represents an individual record and that the variables are in columns.

Fixed design
Random design

# Stack every `point_*_classification` column into name/value pairs:
# `type` records which column a value came from and `classification`
# holds the label itself.
data_fixed <- pivot_longer(
  data_fixed,
  cols = matches("point_.*_classification"),
  values_to = "classification",
  names_to = "type"
)

# Stack every `point_*_classification` column into name/value pairs:
# `type` records which column a value came from and `classification`
# holds the label itself.
data_random <- pivot_longer(
  data_random,
  cols = matches("point_.*_classification"),
  values_to = "classification",
  names_to = "type"
)

7 Joining to the code group lookup data

Within reefCloud, the taxonomic resolution of point classification depends on the granularity of training label sets. For many analyses (such as spatio-temporal modelling of hard coral cover), this is overly granular. In order to group the taxonomic levels up to the larger groups (such as hard coral, soft coral and macroalgae), it is necessary to join the data to a lookup table representing the labelsets. However, the current synthetic data were only constructed to the broad categories (hard coral, soft coral and macroalgae) in the first place.

Although the current data are already recorded in the desired taxonomic resolution, for code completeness, we will still join in the labelset data (which we will first generate here).

# Build a three-row labelset lookup (one row per classification code used in
# the synthetic data) and write it to disk so it can be read back in below.
labelset <- tibble(
  CODE = c("HCC", "SC", "MA"),
  DESCRIPTION = c("Hard coral", "Soft coral", "Macroalgae"),
  `FUNCTIONAL GROUP` = c("Hard coral", "Soft coral", "Macroalgae"),
  `KEYBOARD SHORTCUT CODE` = NA # length-1 logical, recycled to every row
)
write_csv(labelset, file = "../data/labelset.csv")

# Read the labelset lookup from disk, trimming whitespace around each field.
labelset <- "../data/labelset.csv" |> read_csv(trim_ws = TRUE)

Rows: 3 Columns: 4
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (3): CODE, DESCRIPTION, FUNCTIONAL GROUP
lgl (1): KEYBOARD SHORTCUT CODE

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Compact column-wise overview of the labelset.
glimpse(labelset)

Rows: 3
Columns: 4
$ CODE                     <chr> "HCC", "SC", "MA"
$ DESCRIPTION              <chr> "Hard coral", "Soft coral", "Macroalgae"
$ `FUNCTIONAL GROUP`       <chr> "Hard coral", "Soft coral", "Macroalgae"
$ `KEYBOARD SHORTCUT CODE` <lgl> NA, NA, NA

# First rows of the labelset (all three rows in this case).
head(labelset)

# A tibble: 3 × 4
  CODE  DESCRIPTION `FUNCTIONAL GROUP` `KEYBOARD SHORTCUT CODE`
  <chr> <chr>       <chr>              <lgl>                   
1 HCC   Hard coral  Hard coral         NA                      
2 SC    Soft coral  Soft coral         NA                      
3 MA    Macroalgae  Macroalgae         NA

# Internal structure of the labelset, including the readr column spec.
str(labelset)

spc_tbl_ [3 × 4] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
 $ CODE                  : chr [1:3] "HCC" "SC" "MA"
 $ DESCRIPTION           : chr [1:3] "Hard coral" "Soft coral" "Macroalgae"
 $ FUNCTIONAL GROUP      : chr [1:3] "Hard coral" "Soft coral" "Macroalgae"
 $ KEYBOARD SHORTCUT CODE: logi [1:3] NA NA NA
 - attr(*, "spec")=
  .. cols(
  ..   CODE = col_character(),
  ..   DESCRIPTION = col_character(),
  ..   `FUNCTIONAL GROUP` = col_character(),
  ..   `KEYBOARD SHORTCUT CODE` = col_logical()
  .. )
 - attr(*, "problems")=<externalptr>

# Summarise each labelset column as a codebook and render it as a table.
knitr::kable(datawizard::data_codebook(labelset))

Warning: Following 1 columns were empty and have been removed:
  KEYBOARD SHORTCUT CODE

ID	Name	Type	Missings	Values	N	Prop	.row_id
1	CODE	character	0 (0.0%)	HCC	1	33.3%	1
				MA	1	33.3%	1
				SC	1	33.3%	1
							1
2	DESCRIPTION	character	0 (0.0%)	Hard coral	1	33.3%	2
				Macroalgae	1	33.3%	2
				Soft coral	1	33.3%	2
							2
3	FUNCTIONAL GROUP	character	0 (0.0%)	Hard coral	1	33.3%	3
				Macroalgae	1	33.3%	3
				Soft coral	1	33.3%	3
							3

Fixed design
Random design

# Attach the functional group (GROUP) that corresponds to each point's
# classification code. A left join keeps every point, even one whose code is
# absent from the lookup (its GROUP is then NA).
data_fixed <- left_join(
  data_fixed,
  dplyr::select(labelset, CODE, GROUP = `FUNCTIONAL GROUP`),
  by = join_by(classification == CODE)
)
head(as.data.frame(data_fixed))

  site_id      site_name site_latitude site_longitude   survey_start_date
1       1 Reef118 Site 1     -20.27968       3.959314 2010-01-01 04:00:00
2       1 Reef118 Site 1     -20.27968       3.959314 2010-01-01 04:00:00
3       1 Reef118 Site 1     -20.27968       3.959314 2010-01-01 04:00:00
4       1 Reef118 Site 1     -20.27968       3.959314 2010-01-01 04:00:00
5       1 Reef118 Site 1     -20.27968       3.959314 2010-01-01 04:00:00
6       1 Reef118 Site 1     -20.27968       3.959314 2010-01-01 04:00:00
  survey_depth survey_transect_number image_id image_quality point_id point_no
1            3                      1        1           100        1        1
2            3                      1        1           100        2        2
3            3                      1        1           100        3        3
4            3                      1        1           100        4        4
5            3                      1        1           100        5        5
6            3                      1       13           100       61        1
                          type classification      GROUP
1 point_machine_classification             MA Macroalgae
2 point_machine_classification             MA Macroalgae
3 point_machine_classification            HCC Hard coral
4 point_machine_classification             MA Macroalgae
5 point_machine_classification             MA Macroalgae
6 point_machine_classification             MA Macroalgae

# Attach the functional group (GROUP) that corresponds to each point's
# classification code. A left join keeps every point, even one whose code is
# absent from the lookup (its GROUP is then NA).
data_random <- left_join(
  data_random,
  dplyr::select(labelset, CODE, GROUP = `FUNCTIONAL GROUP`),
  by = join_by(classification == CODE)
)
head(as.data.frame(data_random))

  site_id    site_name site_latitude site_longitude   survey_start_date
1       1 Reef1 Site 1     -19.90968       9.819314 2016-01-01 04:00:00
2       1 Reef1 Site 1     -19.90968       9.819314 2016-01-01 04:00:00
3       1 Reef1 Site 1     -19.90968       9.819314 2016-01-01 04:00:00
4       1 Reef1 Site 1     -19.90968       9.819314 2016-01-01 04:00:00
5       1 Reef1 Site 1     -19.90968       9.819314 2016-01-01 04:00:00
6       1 Reef1 Site 1     -19.90968       9.819314 2016-01-01 04:00:00
  survey_depth survey_transect_number image_id image_quality point_id point_no
1            3                      1        1           100        1        1
2            3                      1        1           100        2        2
3            3                      1        1           100        3        3
4            3                      1        1           100        4        4
5            3                      1        1           100        5        5
6            3                      1       13           100       61        1
                          type classification      GROUP
1 point_machine_classification             SC Soft coral
2 point_machine_classification            HCC Hard coral
3 point_machine_classification             MA Macroalgae
4 point_machine_classification            HCC Hard coral
5 point_machine_classification            HCC Hard coral
6 point_machine_classification            HCC Hard coral

8 Tally up points

Count the number of points of each type as well as sum up the total number of points per image.

Fixed design
Random design

# Tally points per image:
#   COUNT = number of points of each GROUP within an image
#   TOTAL = number of points (over all GROUPs) within that image
data_fixed <- data_fixed |>
  group_by(
    across(c(
      starts_with("site"), starts_with("survey"),
      type, image_id, GROUP
    ))
  ) |>
  # "drop_last" peels off GROUP (the last grouping variable), so the
  # following sum runs over all groups within each image.
  summarise(COUNT = n(), .groups = "drop_last") |>
  mutate(TOTAL = sum(COUNT)) |>
  ungroup()

# Tally points per image:
#   COUNT = number of points of each GROUP within an image
#   TOTAL = number of points (over all GROUPs) within that image
data_random <- data_random |>
  group_by(
    across(c(
      starts_with("site"), starts_with("survey"),
      type, image_id, GROUP
    ))
  ) |>
  # "drop_last" peels off GROUP (the last grouping variable), so the
  # following sum runs over all groups within each image.
  summarise(COUNT = n(), .groups = "drop_last") |>
  mutate(TOTAL = sum(COUNT)) |>
  ungroup()

9 Recode transects

Fixed design
Random design

# Build a transect identifier from site, depth and transect number.
# The parts are joined with "_" so that different combinations can never
# collapse to the same string (without a separator, site 1 / depth 13 /
# transect 1 and site 11 / depth 3 / transect 1 would both give "1131").
data_fixed <-
  data_fixed |>
  mutate(
    transect_id = paste(site_id, survey_depth, survey_transect_number,
      sep = "_"
    )
  )

# Build a transect identifier from site, depth and transect number.
# The parts are joined with "_" so that different combinations can never
# collapse to the same string (without a separator, site 1 / depth 13 /
# transect 1 and site 11 / depth 3 / transect 1 would both give "1131").
data_random <-
  data_random |>
  mutate(
    transect_id = paste(site_id, survey_depth, survey_transect_number,
      sep = "_"
    )
  )

10 Fill in any gaps

Since the data represent the classification of points in images, they only record what was present, not what was absent. For example, if all five points in an image are Macroalgae, then every other functional group is absent from that image - yet this information is missing from the data. For modelling purposes it is vital that we fill in these zero values.

To do so, we must create a data set that contains every GROUP in every IMAGE.

Fixed design
Random design

# Every functional group observed anywhere in the fixed-design data.
GROUPS <- unique(data_fixed$GROUP)

# One row per image (with its identifying fields and point TOTAL), crossed
# with every functional group: the full image x GROUP grid.
data.filler <- tidyr::crossing(
  distinct(
    dplyr::select(
      data_fixed,
      starts_with("site"),
      survey_start_date, survey_depth, transect_id,
      image_id, type, TOTAL
    )
  ),
  GROUP = GROUPS
)

# Full (natural) join onto the grid adds a row for each image x GROUP
# combination that had no points; those rows arrive with COUNT = NA, which is
# recoded to zero. The result is deliberately left grouped: the
# transect-level summary that follows relies on these groups.
data_fixed <- full_join(data_fixed, data.filler) |>
  group_by(
    across(c(
      starts_with("site"),
      survey_start_date, survey_depth, transect_id,
      image_id, type, GROUP
    ))
  ) |>
  mutate(
    COUNT = ifelse(is.na(COUNT), 0, COUNT),
    TOTAL = max(TOTAL, na.rm = TRUE)
  )

Joining with `by = join_by(site_id, site_name, site_latitude, site_longitude,
survey_start_date, survey_depth, type, image_id, GROUP, TOTAL, transect_id)`

# Every functional group observed anywhere in the random-design data.
GROUPS <- unique(data_random$GROUP)

# One row per image (with its identifying fields and point TOTAL), crossed
# with every functional group: the full image x GROUP grid.
data.filler <- tidyr::crossing(
  distinct(
    dplyr::select(
      data_random,
      starts_with("site"),
      survey_start_date, survey_depth, transect_id,
      image_id, type, TOTAL
    )
  ),
  GROUP = GROUPS
)

# Full (natural) join onto the grid adds a row for each image x GROUP
# combination that had no points; those rows arrive with COUNT = NA, which is
# recoded to zero. The result is deliberately left grouped: the
# transect-level summary that follows relies on these groups.
data_random <- full_join(data_random, data.filler) |>
  group_by(
    across(c(
      starts_with("site"),
      survey_start_date, survey_depth, transect_id,
      image_id, type, GROUP
    ))
  ) |>
  mutate(
    COUNT = ifelse(is.na(COUNT), 0, COUNT),
    TOTAL = max(TOTAL, na.rm = TRUE)
  )

Joining with `by = join_by(site_id, site_name, site_latitude, site_longitude,
survey_start_date, survey_depth, type, image_id, GROUP, TOTAL, transect_id)`

11 Sum to transect level

Fixed design
Random design

# Collapse images to transects. This relies on data_fixed still being grouped
# from the gap-filling step: removing image_id from the grouping makes the
# sums run over all images within each transect x type x GROUP combination.
# summarise() then drops the last grouping variable (GROUP) and the final
# ungroup() clears the rest.
data_fixed <- droplevels(
  data_fixed |>
    ungroup(image_id) |>
    summarise(COUNT = sum(COUNT), TOTAL = sum(TOTAL)) |>
    ungroup()
)

`summarise()` has grouped output by 'site_id', 'site_name', 'site_latitude',
'site_longitude', 'survey_start_date', 'survey_depth', 'transect_id', 'type'.
You can override using the `.groups` argument.

# Collapse images to transects. This relies on data_random still being
# grouped from the gap-filling step: removing image_id from the grouping makes
# the sums run over all images within each transect x type x GROUP
# combination. summarise() then drops the last grouping variable (GROUP) and
# the final ungroup() clears the rest.
data_random <- droplevels(
  data_random |>
    ungroup(image_id) |>
    summarise(COUNT = sum(COUNT), TOTAL = sum(TOTAL)) |>
    ungroup()
)

`summarise()` has grouped output by 'site_id', 'site_name', 'site_latitude',
'site_longitude', 'survey_start_date', 'survey_depth', 'transect_id', 'type'.
You can override using the `.groups` argument.

12 Generate a year field

Fixed design
Random design

# Year     = calendar year of the survey.
# TropYear = calendar year of the survey date shifted forward three months,
#            so surveys from October onwards count towards the next year.
# The shift uses lubridate's rollback addition (the function behind `%m+%`):
# plain `date + months(3)` yields NA whenever the shifted day does not exist
# (e.g. 30 November -> "30 February"), which would silently give TropYear = NA.
data_fixed <-
  data_fixed |>
  mutate(
    Year = lubridate::year(survey_start_date),
    TropYear = lubridate::year(
      lubridate::add_with_rollback(survey_start_date, months(3))
    )
  )

# Year     = calendar year of the survey.
# TropYear = calendar year of the survey date shifted forward three months,
#            so surveys from October onwards count towards the next year.
# The shift uses lubridate's rollback addition (the function behind `%m+%`):
# plain `date + months(3)` yields NA whenever the shifted day does not exist
# (e.g. 30 November -> "30 February"), which would silently give TropYear = NA.
data_random <-
  data_random |>
  mutate(
    Year = lubridate::year(survey_start_date),
    TropYear = lubridate::year(
      lubridate::add_with_rollback(survey_start_date, months(3))
    )
  )

13 Generate a reef id

Fixed design
Random design

# Derive the reef identifier from the site name by keeping only the text
# that precedes " Site" (e.g. "Reef118 Site 1" -> "Reef118").
data_fixed <- mutate(
  data_fixed,
  Reef_id = str_replace(
    string = site_name,
    pattern = "(.*) Site.*",
    replacement = "\\1"
  )
)

# Derive the reef identifier from the site name by keeping only the text
# that precedes " Site" (e.g. "Reef1 Site 1" -> "Reef1").
data_random <- mutate(
  data_random,
  Reef_id = str_replace(
    string = site_name,
    pattern = "(.*) Site.*",
    replacement = "\\1"
  )
)

14 Visualisations

Fixed design
Random design

# Hard coral cover (proportion of points per transect) through time for the
# fixed design: one panel per reef/site, coloured by depth, with each
# transect's observations joined by a line.
ggplot(
  data = filter(
    data_fixed,
    type == "point_machine_classification" & GROUP == "Hard coral"
  ),
  mapping = aes(
    x = survey_start_date,
    y = COUNT/TOTAL,
    colour = factor(survey_depth)
  )
) +
  geom_point() +
  geom_line(aes(group = transect_id)) +
  facet_wrap(~Reef_id + site_name)

# Hard coral cover (proportion of points per transect) through time for the
# random design, coloured by depth. Points only, in a single panel.
ggplot(
  data = filter(
    data_random,
    type == "point_machine_classification" & GROUP == "Hard coral"
  ),
  mapping = aes(
    x = survey_start_date,
    y = COUNT/TOTAL,
    colour = factor(survey_depth)
  )
) +
  geom_point()

  ## Layers used for the fixed design but disabled here.
  ## NOTE(review): presumably because sites are not revisited in the random
  ## design - confirm.
  ## geom_line(aes(group = transect_id)) + 
  ## facet_wrap(~Reef_id + site_name)

The end