Preparation of reefCloud data
1 Synopsis
In the previous tutorial, we created synthetic reefCloud data. In the current tutorial, we will prepare these data for statistical analyses. Recall that we created two data sets, one representing a fixed sampling design, the other representing a random sampling design. We will prepare both of these data sets.
Necessary wrangling (preparation) steps:
- exclude extraneous (unneeded) fields
- exclude poor images
- lengthen the data with respect to classification type
- join to a labelset lookup
- tally up the points per date/image/GROUP/type
- recode transect id
- fill in the gaps and add the zeros
- sum to transect level
- generate a Year field from the sample date
2 Preparations
We will start by loading the required R packages.
This tutorial will prepare the data sets generated at the end of the previous tutorial. These data sets (`reef_data_synthetic_fixed.csv` and `reef_data_synthetic_random.csv`) are reasonably large (they take up substantial disk space on a repository). In order to keep the repository containing these tutorials to a manageable size, rather than track the final, large data sets, I have instead versioned the much smaller penultimate data sets (`data_fixed_locs_obs.RData` and `data_random_locs_obs.RData`). Hence, before starting this tutorial, we will first read in these penultimate data sets and repeat the final preparation and writing steps of the last tutorial. If you are working through these tutorials in sequential order and have already created the final data sets, there is no need to repeat this step.
## Sampling design constants used when expanding cover values into points
Number_of_transects_per_site  <- 5
Depths                        <- 2
Number_of_frames_per_transect <- 100
Points_per_frame              <- 5

## Standard deviations of the simulated variance components.
## All of the values below are expressed on the link scale.
## (the hcc_/sc_/ma_ prefixes presumably map to the HCC, SC and MA
##  benthic groups - confirm against the previous tutorial)

## hcc
hcc_site_sigma     <- 0.5   # variability in Sites within Locations
hcc_transect_sigma <- 0.2   # variability in Transects within Sites
hcc_sigma          <- 0.1   # random noise

## sc
sc_site_sigma      <- 0.05  # variability in Sites within Locations
sc_transect_sigma  <- 0.02  # variability in Transects within Sites
sc_sigma           <- 0.01  # random noise

## ma
ma_site_sigma      <- 0.5   # variability in Sites within Locations
ma_transect_sigma  <- 0.2   # variability in Transects within Sites
ma_sigma           <- 0.1   # random noise
Fixed design
## Need to split the percentage cover into points and frames.
##
## Within each Reef/Site/Transect/Year/Depth/Date group:
##  1. each row's share of the group's total Value is scaled to the number
##     of points in a transect (frames x points per frame) and rounded;
##  2. each row is replicated once per point (tidyr::uncount());
##  3. the resulting points are shuffled, then numbered within frames
##     (POINT_NO) and allocated to frames (FRAME).
##
## NOTE: the shares are rounded independently, so a group's point total can
## differ slightly from Number_of_frames_per_transect * Points_per_frame.
## When that happens rep()/rep_len() truncate or recycle to n() rows, so the
## last frame may be short or the first frame may receive an extra point.
load(file = "../data/data_fixed_locs_obs.RData")
data_fixed_locs_obs <- data_fixed_locs_obs |>
  group_by(Reef, Site, Transect, Year, Depth, Date) |>
  mutate(
    Points = round(
      Number_of_frames_per_transect * Points_per_frame *
        (Value / sum(Value)),
      0
    ),
    ## clamp any negative counts to zero before replicating rows
    Points = pmax(Points, 0)
  ) |>
  tidyr::uncount(Points) |>
  ## random permutation of the points within each group
  sample_n(n(), replace = FALSE) |>
  mutate(
    POINT_NO = rep_len(seq_len(Points_per_frame), length.out = n()),
    ## FRAME = 1 + cumsum(POINT_NO) %/% (sum(1:Points_per_frame) + 1e-10)) |>
    FRAME = rep(seq_len(Number_of_frames_per_transect),
      each = Points_per_frame, length.out = n()
    )
  ) |>
  ungroup()
## a |> group_by(Reef, Site, Transect, Year, Depth, Group) |>
## summarise(Count = n()) |>
## ungroup(Group) |>
## mutate(Total=sum(Count),
## Cover = Count/Total)
## Recast the fixed-design point data into reefCloud-style export fields.
## Identifiers (site_id, survey_id, image_id, point_id) are the integer
## codes of factors built from pasted labels, so they follow the sorted
## order of those labels.
reef_data_synthetic_fixed <-
  data_fixed_locs_obs |>
  ## helper labels ("S1" -> "Site 1", "T1" -> "Transect 1"); dropped below
  mutate(
    SITE_NO = str_replace(Site, "^S", "Site "),
    TRANSECT_NO = str_replace(Transect, "^T", "Transect ")
  ) |>
  ## project and site level fields
  mutate(
    project_id = 1,
    project_name = "synthetic_fixed",
    site_name = factor(paste(Reef, SITE_NO)),
    site_id = as.numeric(site_name),
    site_latitude = Latitude,
    site_longitude = Longitude,
    site_depth = Depth,
    site_country = "synthetic Country",
    site_reef_name = factor(Reef),
    ## fields with no synthetic equivalent are left missing
    site_reef_type = NA,
    site_reef_zone = NA,
    site_code = NA,
    site_management = NA
  ) |>
  ## survey (transect visit) level fields
  mutate(
    survey_title = factor(
      paste(Reef, SITE_NO, TRANSECT_NO, format(Date, "%Y-%m-%d"))
    ),
    survey_id = as.numeric(survey_title),
    survey_start_date = Date,
    survey_depth = Depth,
    survey_transect_number = as.numeric(
      str_replace(TRANSECT_NO, "Transect ", "")
    )
  ) |>
  ## image (frame) and point level fields
  mutate(
    image_name = factor(paste(survey_title, FRAME)),
    image_id = as.numeric(image_name),
    image_quality = 100,
    point_no = POINT_NO,
    point_id = as.numeric(factor(paste(image_name, POINT_NO))),
    point_machine_classification = Group
  ) |>
  ## keep only the export fields, in export order
  dplyr::select(dplyr::all_of(c(
    "project_id", "project_name",
    "site_id", "site_name", "site_latitude", "site_longitude",
    "site_depth", "site_country", "site_reef_name", "site_reef_type",
    "site_reef_zone", "site_code", "site_management",
    "survey_id", "survey_title", "survey_start_date", "survey_depth",
    "survey_transect_number",
    "image_id", "image_name", "image_quality",
    "point_id", "point_no", "point_machine_classification"
  )))
## PCODE = "SYNTHETIC-fixed",
## ID = 1:n(),
## CRUISE_CODE = paste0("SYNTHETIC",Year),
## REEF_NAME = Reef,
## AIMS_REEF_NAME = Reef,
## SECTOR = "synthetic",
## LATITUDE = Latitude,
## LONGITUDE = Longitude,
## SITE_NO = Site,
## TRANSECT_NO = Transect,
## SITE_DEPTH = Depth,
## REEF_ZONE = "-",
## REPORT_YEAR = Year,
## SURVEY_DATE = Date,
## FRAME = paste0(PCODE, "/", REEF_NAME, "/",
## REEF_ZONE, "/", SITE_NO, "/", SITE_DEPTH,
## "/", TRANSECT_NO, "/", REPORT_YEAR, "/", FRAME),
## POINT_NO = POINT_NO,
## FAMILY = NA,
## GROUP_DESC = Group,
## REEFPAGE_CATEGORY = paste0(Group,"_alt")
## ) |>
## dplyr::select(PCODE, ID, CRUISE_CODE, REEF_NAME,
## AIMS_REEF_NAME, SECTOR,
## LATITUDE, LONGITUDE, SITE_NO, TRANSECT_NO, SITE_DEPTH,
## REEF_ZONE, REPORT_YEAR, SURVEY_DATE, FRAME, POINT_NO,
## FAMILY, GROUP_DESC, REEFPAGE_CATEGORY)
## Export the fixed-design data set, then preview its first rows
reef_data_synthetic_fixed |>
  write_csv(file = "../data/reef_data_synthetic_fixed.csv")
reef_data_synthetic_fixed |>
  head() |>
  rmarkdown::paged_table()
Random design
## Need to split the percentage cover into points and frames.
##
## Within each Reef/Site/Transect/Year/Depth/Date group:
##  1. each row's share of the group's total Value is scaled to the number
##     of points in a transect (frames x points per frame) and rounded;
##  2. each row is replicated once per point (tidyr::uncount());
##  3. the resulting points are shuffled, then numbered within frames
##     (POINT_NO) and allocated to frames (FRAME).
##
## NOTE: the shares are rounded independently, so a group's point total can
## differ slightly from Number_of_frames_per_transect * Points_per_frame.
## When that happens rep()/rep_len() truncate or recycle to n() rows, so the
## last frame may be short or the first frame may receive an extra point.
load(file = "../data/data_random_locs_obs.RData")
data_random_locs_obs <- data_random_locs_obs |>
  group_by(Reef, Site, Transect, Year, Depth, Date) |>
  mutate(
    Points = round(
      Number_of_frames_per_transect * Points_per_frame *
        (Value / sum(Value)),
      0
    ),
    ## clamp any negative counts to zero before replicating rows
    Points = pmax(Points, 0)
  ) |>
  tidyr::uncount(Points) |>
  ## random permutation of the points within each group
  sample_n(n(), replace = FALSE) |>
  mutate(
    POINT_NO = rep_len(seq_len(Points_per_frame), length.out = n()),
    ## FRAME = 1 + cumsum(POINT_NO) %/% (sum(1:Points_per_frame) + 1e-10)) |>
    FRAME = rep(seq_len(Number_of_frames_per_transect),
      each = Points_per_frame, length.out = n()
    )
  ) |>
  ungroup()
## Recast the random-design point data into reefCloud-style export fields.
## Identifiers (site_id, survey_id, image_id, point_id) are the integer
## codes of factors built from pasted labels, so they follow the sorted
## order of those labels.
##
## NOTE(review): survey_title (and hence image_name / point_id) is built
## from Reef, site, transect and date only - Depth is not part of the label.
## If the same transect/date occurs at more than one Depth, those rows will
## share survey, image and point ids. Confirm this is intended.
reef_data_synthetic_random <-
  data_random_locs_obs |>
  mutate(
    project_id = 1,
    ## label this data set as the random design (was "synthetic_fixed",
    ## a copy-paste slip from the fixed-design block)
    project_name = "synthetic_random",
    ## helper labels ("S1" -> "Site 1", "T1" -> "Transect 1"); dropped below
    SITE_NO = str_replace(Site, "^S", "Site "),
    TRANSECT_NO = str_replace(Transect, "^T", "Transect "),
    site_name = factor(paste(Reef, SITE_NO)),
    site_id = as.numeric(site_name),
    site_latitude = Latitude,
    site_longitude = Longitude,
    site_depth = Depth,
    site_country = "synthetic Country",
    site_reef_name = factor(Reef),
    ## fields with no synthetic equivalent are left missing
    site_reef_type = NA,
    site_reef_zone = NA,
    site_code = NA,
    site_management = NA,
    survey_title = factor(
      paste(Reef, SITE_NO, TRANSECT_NO, format(Date, "%Y-%m-%d"))
    ),
    survey_id = as.numeric(survey_title),
    survey_start_date = Date,
    survey_depth = Depth,
    survey_transect_number = as.numeric(
      str_replace(TRANSECT_NO, "Transect ", "")
    ),
    image_name = factor(paste(survey_title, FRAME)),
    image_id = as.numeric(image_name),
    image_quality = 100,
    point_no = POINT_NO,
    point_id = as.numeric(factor(paste(image_name, POINT_NO))),
    point_machine_classification = Group
  ) |>
  ## keep only the export fields, in export order
  dplyr::select(
    project_id,
    project_name,
    site_id,
    site_name,
    site_latitude,
    site_longitude,
    site_depth,
    site_country,
    site_reef_name,
    site_reef_type,
    site_reef_zone,
    site_code,
    site_management,
    survey_id,
    survey_title,
    survey_start_date,
    survey_depth,
    survey_transect_number,
    image_id,
    image_name,
    image_quality,
    point_id,
    point_no,
    point_machine_classification
  )
## PCODE = "SYNTHETIC-random",
## ID = 1:n(),
## CRUISE_CODE = paste0("SYNTHETIC", Year),
## REEF_NAME = Reef,
## AIMS_REEF_NAME = Reef,
## SECTOR = "synthetic",
## LATITUDE = Latitude,
## LONGITUDE = Longitude,
## SITE_NO = Site,
## TRANSECT_NO = Transect,
## SITE_DEPTH = Depth,
## REEF_ZONE = "-",
## REPORT_YEAR = Year,
## SURVEY_DATE = Date,
## FRAME = paste0(PCODE, "/", REEF_NAME, "/", REEF_ZONE,
## "/", SITE_NO, "/", SITE_DEPTH, "/", TRANSECT_NO,
## "/", REPORT_YEAR, "/", FRAME),
## POINT_NO = POINT_NO,
## FAMILY = NA,
## GROUP_DESC = Group,
## REEFPAGE_CATEGORY = paste0(Group, "_alt")
## ) |>
## dplyr::select(
## PCODE, ID, CRUISE_CODE, REEF_NAME, AIMS_REEF_NAME, SECTOR,
## LATITUDE, LONGITUDE, SITE_NO, TRANSECT_NO, SITE_DEPTH,
## REEF_ZONE, REPORT_YEAR, SURVEY_DATE, FRAME, POINT_NO,
## FAMILY, GROUP_DESC, REEFPAGE_CATEGORY
## )
## Export the random-design data set, then preview its first rows
reef_data_synthetic_random |>
  write_csv(file = "../data/reef_data_synthetic_random.csv")
reef_data_synthetic_random |>
  head() |>
  rmarkdown::paged_table()
3 Read in the data
Let's start by reading in the data sets (which were exported as csv files). There are many functions in R that can read in a CSV file. We will use the `read_csv()` function as it is part of the tidyverse ecosystem.
Rows: 3000027 Columns: 24
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (7): project_name, site_name, site_country, site_reef_name, survey_tit...
dbl (12): project_id, site_id, site_latitude, site_longitude, site_depth, s...
lgl (4): site_reef_type, site_reef_zone, site_code, site_management
dttm (1): survey_start_date
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
After reading in a dataset, it is always a good idea to quickly explore a few summaries in order to ascertain whether the imported data are correctly transcribed. In particular, we should pay attention to whether there are any unexpected missing values and ensure that each variable (column) has the expected class (e.g. that variables we expected to be considered numbers are indeed listed as either `<dbl>` or `<int>` rather than `<chr>`).
Rows: 3,000,027
Columns: 24
$ project_id <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ project_name <chr> "synthetic_fixed", "synthetic_fixed", "sy…
$ site_id <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ site_name <chr> "Reef118 Site 1", "Reef118 Site 1", "Reef…
$ site_latitude <dbl> -20.27968, -20.27968, -20.27968, -20.2796…
$ site_longitude <dbl> 3.959314, 3.959314, 3.959314, 3.959314, 3…
$ site_depth <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,…
$ site_country <chr> "synthetic Country", "synthetic Country",…
$ site_reef_name <chr> "Reef118", "Reef118", "Reef118", "Reef118…
$ site_reef_type <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ site_reef_zone <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ site_code <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ site_management <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ survey_id <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ survey_title <chr> "Reef118 Site 1 Transect 1 2010-01-01", "…
$ survey_start_date <dttm> 2010-01-01 04:00:00, 2010-01-01 04:00:00…
$ survey_depth <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,…
$ survey_transect_number <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ image_id <dbl> 1, 1, 1, 1, 1, 13, 13, 13, 13, 13, 24, 24…
$ image_name <chr> "Reef118 Site 1 Transect 1 2010-01-01 1",…
$ image_quality <dbl> 100, 100, 100, 100, 100, 100, 100, 100, 1…
$ point_id <dbl> 1, 2, 3, 4, 5, 61, 62, 63, 64, 65, 116, 1…
$ point_no <dbl> 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4,…
$ point_machine_classification <chr> "MA", "MA", "HCC", "MA", "MA", "MA", "MA"…
# A tibble: 6 × 24
project_id project_name site_id site_name site_latitude site_longitude
<dbl> <chr> <dbl> <chr> <dbl> <dbl>
1 1 synthetic_fixed 1 Reef118 Site 1 -20.3 3.96
2 1 synthetic_fixed 1 Reef118 Site 1 -20.3 3.96
3 1 synthetic_fixed 1 Reef118 Site 1 -20.3 3.96
4 1 synthetic_fixed 1 Reef118 Site 1 -20.3 3.96
5 1 synthetic_fixed 1 Reef118 Site 1 -20.3 3.96
6 1 synthetic_fixed 1 Reef118 Site 1 -20.3 3.96
# ℹ 18 more variables: site_depth <dbl>, site_country <chr>,
# site_reef_name <chr>, site_reef_type <lgl>, site_reef_zone <lgl>,
# site_code <lgl>, site_management <lgl>, survey_id <dbl>,
# survey_title <chr>, survey_start_date <dttm>, survey_depth <dbl>,
# survey_transect_number <dbl>, image_id <dbl>, image_name <chr>,
# image_quality <dbl>, point_id <dbl>, point_no <dbl>,
# point_machine_classification <chr>
spc_tbl_ [3,000,027 × 24] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
$ project_id : num [1:3000027] 1 1 1 1 1 1 1 1 1 1 ...
$ project_name : chr [1:3000027] "synthetic_fixed" "synthetic_fixed" "synthetic_fixed" "synthetic_fixed" ...
$ site_id : num [1:3000027] 1 1 1 1 1 1 1 1 1 1 ...
$ site_name : chr [1:3000027] "Reef118 Site 1" "Reef118 Site 1" "Reef118 Site 1" "Reef118 Site 1" ...
$ site_latitude : num [1:3000027] -20.3 -20.3 -20.3 -20.3 -20.3 ...
$ site_longitude : num [1:3000027] 3.96 3.96 3.96 3.96 3.96 ...
$ site_depth : num [1:3000027] 3 3 3 3 3 3 3 3 3 3 ...
$ site_country : chr [1:3000027] "synthetic Country" "synthetic Country" "synthetic Country" "synthetic Country" ...
$ site_reef_name : chr [1:3000027] "Reef118" "Reef118" "Reef118" "Reef118" ...
$ site_reef_type : logi [1:3000027] NA NA NA NA NA NA ...
$ site_reef_zone : logi [1:3000027] NA NA NA NA NA NA ...
$ site_code : logi [1:3000027] NA NA NA NA NA NA ...
$ site_management : logi [1:3000027] NA NA NA NA NA NA ...
$ survey_id : num [1:3000027] 1 1 1 1 1 1 1 1 1 1 ...
$ survey_title : chr [1:3000027] "Reef118 Site 1 Transect 1 2010-01-01" "Reef118 Site 1 Transect 1 2010-01-01" "Reef118 Site 1 Transect 1 2010-01-01" "Reef118 Site 1 Transect 1 2010-01-01" ...
$ survey_start_date : POSIXct[1:3000027], format: "2010-01-01 04:00:00" "2010-01-01 04:00:00" ...
$ survey_depth : num [1:3000027] 3 3 3 3 3 3 3 3 3 3 ...
$ survey_transect_number : num [1:3000027] 1 1 1 1 1 1 1 1 1 1 ...
$ image_id : num [1:3000027] 1 1 1 1 1 13 13 13 13 13 ...
$ image_name : chr [1:3000027] "Reef118 Site 1 Transect 1 2010-01-01 1" "Reef118 Site 1 Transect 1 2010-01-01 1" "Reef118 Site 1 Transect 1 2010-01-01 1" "Reef118 Site 1 Transect 1 2010-01-01 1" ...
$ image_quality : num [1:3000027] 100 100 100 100 100 100 100 100 100 100 ...
$ point_id : num [1:3000027] 1 2 3 4 5 61 62 63 64 65 ...
$ point_no : num [1:3000027] 1 2 3 4 5 1 2 3 4 5 ...
$ point_machine_classification: chr [1:3000027] "MA" "MA" "HCC" "MA" ...
- attr(*, "spec")=
.. cols(
.. project_id = col_double(),
.. project_name = col_character(),
.. site_id = col_double(),
.. site_name = col_character(),
.. site_latitude = col_double(),
.. site_longitude = col_double(),
.. site_depth = col_double(),
.. site_country = col_character(),
.. site_reef_name = col_character(),
.. site_reef_type = col_logical(),
.. site_reef_zone = col_logical(),
.. site_code = col_logical(),
.. site_management = col_logical(),
.. survey_id = col_double(),
.. survey_title = col_character(),
.. survey_start_date = col_datetime(format = ""),
.. survey_depth = col_double(),
.. survey_transect_number = col_double(),
.. image_id = col_double(),
.. image_name = col_character(),
.. image_quality = col_double(),
.. point_id = col_double(),
.. point_no = col_double(),
.. point_machine_classification = col_character()
.. )
- attr(*, "problems")=<externalptr>
Warning: Following 4 columns were empty and have been removed:
site_reef_type, site_reef_zone, site_code and site_management
ID | Name | Type | Missings | Values | N | Prop | .row_id |
---|---|---|---|---|---|---|---|
1 | project_id | numeric | 0 (0.0%) | 1 | 3000027 | 100.0% | 1 |
1 | |||||||
2 | project_name | character | 0 (0.0%) | synthetic_fixed | 3000027 | 100.0% | 2 |
2 | |||||||
3 | site_id | numeric | 0 (0.0%) | [1, 50] | 3000027 | 3 | |
3 | |||||||
4 | site_name | character | 0 (0.0%) | Reef118 Site 1 | 59992 | 2.0% | 4 |
Reef118 Site 2 | 59999 | 2.0% | 4 | ||||
Reef137 Site 1 | 60000 | 2.0% | 4 | ||||
Reef137 Site 2 | 59998 | 2.0% | 4 | ||||
Reef14 Site 1 | 60006 | 2.0% | 4 | ||||
Reef14 Site 2 | 60002 | 2.0% | 4 | ||||
Reef153 Site 1 | 60013 | 2.0% | 4 | ||||
Reef153 Site 2 | 60007 | 2.0% | 4 | ||||
Reef159 Site 1 | 59999 | 2.0% | 4 | ||||
Reef159 Site 2 | 60004 | 2.0% | 4 | ||||
(…) | NA | NA | 4 | ||||
4 | |||||||
5 | site_latitude | numeric | 0 (0.0%) | [-20.69, -10.32] | 3000027 | 5 | |
5 | |||||||
6 | site_longitude | numeric | 0 (0.0%) | [0.86, 9.04] | 3000027 | 6 | |
6 | |||||||
7 | site_depth | numeric | 0 (0.0%) | 3 | 1500009 | 50.0% | 7 |
10 | 1500018 | 50.0% | 7 | ||||
7 | |||||||
8 | site_country | character | 0 (0.0%) | synthetic Country | 3000027 | 100.0% | 8 |
8 | |||||||
9 | site_reef_name | character | 0 (0.0%) | Reef118 | 119991 | 4.0% | 9 |
Reef137 | 119998 | 4.0% | 9 | ||||
Reef14 | 120008 | 4.0% | 9 | ||||
Reef153 | 120020 | 4.0% | 9 | ||||
Reef159 | 120003 | 4.0% | 9 | ||||
Reef170 | 119996 | 4.0% | 9 | ||||
Reef179 | 120003 | 4.0% | 9 | ||||
Reef185 | 119993 | 4.0% | 9 | ||||
Reef195 | 119995 | 4.0% | 9 | ||||
Reef197 | 119997 | 4.0% | 9 | ||||
(…) | NA | NA | 9 | ||||
9 | |||||||
14 | survey_id | numeric | 0 (0.0%) | [1, 3000] | 3000027 | 10 | |
10 | |||||||
15 | survey_title | character | 0 (0.0%) | Reef118 Site 1 Transect 1 2010-01-01 | 1000 | 0.0% | 11 |
Reef118 Site 1 Transect 1 2011-01-01 | 1000 | 0.0% | 11 | ||||
Reef118 Site 1 Transect 1 2012-01-01 | 999 | 0.0% | 11 | ||||
Reef118 Site 1 Transect 1 2013-01-01 | 1001 | 0.0% | 11 | ||||
Reef118 Site 1 Transect 1 2014-01-01 | 1000 | 0.0% | 11 | ||||
Reef118 Site 1 Transect 1 2015-01-01 | 1000 | 0.0% | 11 | ||||
Reef118 Site 1 Transect 1 2016-01-01 | 1000 | 0.0% | 11 | ||||
Reef118 Site 1 Transect 1 2017-01-01 | 1000 | 0.0% | 11 | ||||
Reef118 Site 1 Transect 1 2018-01-01 | 1001 | 0.0% | 11 | ||||
Reef118 Site 1 Transect 1 2019-01-01 | 999 | 0.0% | 11 | ||||
(…) | NA | NA | 11 | ||||
11 | |||||||
16 | survey_start_date | numeric | 0 (0.0%) | 2010-01-01 04:00:00 | 249997 | 8.3% | 12 |
2011-01-01 04:00:00 | 250008 | 8.3% | 12 | ||||
2012-01-01 04:00:00 | 250004 | 8.3% | 12 | ||||
2013-01-01 04:00:00 | 249990 | 8.3% | 12 | ||||
2014-01-01 04:00:00 | 250023 | 8.3% | 12 | ||||
2015-01-01 04:00:00 | 250000 | 8.3% | 12 | ||||
2016-01-01 04:00:00 | 250019 | 8.3% | 12 | ||||
2017-01-01 04:00:00 | 249990 | 8.3% | 12 | ||||
2018-01-01 04:00:00 | 249999 | 8.3% | 12 | ||||
2019-01-01 04:00:00 | 250014 | 8.3% | 12 | ||||
(…) | NA | NA | 12 | ||||
12 | |||||||
17 | survey_depth | numeric | 0 (0.0%) | 3 | 1500009 | 50.0% | 13 |
10 | 1500018 | 50.0% | 13 | ||||
13 | |||||||
18 | survey_transect_number | numeric | 0 (0.0%) | 1 | 600012 | 20.0% | 14 |
2 | 600003 | 20.0% | 14 | ||||
3 | 600007 | 20.0% | 14 | ||||
4 | 599997 | 20.0% | 14 | ||||
5 | 600008 | 20.0% | 14 | ||||
14 | |||||||
19 | image_id | numeric | 0 (0.0%) | [1, 300000] | 3000027 | 15 | |
15 | |||||||
20 | image_name | character | 0 (0.0%) | Reef118 Site 1 Transect 1 2010-01-01 1 | 10 | 0.0% | 16 |
Reef118 Site 1 Transect 1 2010-01-01 10 | 10 | 0.0% | 16 | ||||
Reef118 Site 1 Transect 1 2010-01-01 100 | 10 | 0.0% | 16 | ||||
Reef118 Site 1 Transect 1 2010-01-01 11 | 10 | 0.0% | 16 | ||||
Reef118 Site 1 Transect 1 2010-01-01 12 | 10 | 0.0% | 16 | ||||
Reef118 Site 1 Transect 1 2010-01-01 13 | 10 | 0.0% | 16 | ||||
Reef118 Site 1 Transect 1 2010-01-01 14 | 10 | 0.0% | 16 | ||||
Reef118 Site 1 Transect 1 2010-01-01 15 | 10 | 0.0% | 16 | ||||
Reef118 Site 1 Transect 1 2010-01-01 16 | 10 | 0.0% | 16 | ||||
Reef118 Site 1 Transect 1 2010-01-01 17 | 10 | 0.0% | 16 | ||||
(…) | NA | NA | 16 | ||||
16 | |||||||
21 | image_quality | numeric | 0 (0.0%) | 100 | 3000027 | 100.0% | 17 |
17 | |||||||
22 | point_id | numeric | 0 (0.0%) | [1, 1.49996e+06] | 3000027 | 18 | |
18 | |||||||
23 | point_no | numeric | 0 (0.0%) | 1 | 600746 | 20.0% | 19 |
2 | 600000 | 20.0% | 19 | ||||
3 | 600000 | 20.0% | 19 | ||||
4 | 600000 | 20.0% | 19 | ||||
5 | 599281 | 20.0% | 19 | ||||
19 | |||||||
24 | point_machine_classification | character | 0 (0.0%) | HCC | 986690 | 32.9% | 20 |
MA | 1844885 | 61.5% | 20 | ||||
SC | 168452 | 5.6% | 20 | ||||
20 |
Rows: 2999943 Columns: 24
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (7): project_name, site_name, site_country, site_reef_name, survey_tit...
dbl (12): project_id, site_id, site_latitude, site_longitude, site_depth, s...
lgl (4): site_reef_type, site_reef_zone, site_code, site_management
dttm (1): survey_start_date
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
After reading in a dataset, it is always a good idea to quickly explore a few summaries in order to ascertain whether the imported data are correctly transcribed. In particular, we should pay attention to whether there are any unexpected missing values and ensure that each variable (column) has the expected class (e.g. that variables we expected to be considered numbers are indeed listed as either `<dbl>` or `<int>` rather than `<chr>`).
Rows: 2,999,943
Columns: 24
$ project_id <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ project_name <chr> "synthetic_fixed", "synthetic_fixed", "sy…
$ site_id <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ site_name <chr> "Reef1 Site 1", "Reef1 Site 1", "Reef1 Si…
$ site_latitude <dbl> -19.90968, -19.90968, -19.90968, -19.9096…
$ site_longitude <dbl> 9.819314, 9.819314, 9.819314, 9.819314, 9…
$ site_depth <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,…
$ site_country <chr> "synthetic Country", "synthetic Country",…
$ site_reef_name <chr> "Reef1", "Reef1", "Reef1", "Reef1", "Reef…
$ site_reef_type <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ site_reef_zone <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ site_code <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ site_management <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ survey_id <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ survey_title <chr> "Reef1 Site 1 Transect 1 2016-01-01", "Re…
$ survey_start_date <dttm> 2016-01-01 04:00:00, 2016-01-01 04:00:00…
$ survey_depth <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,…
$ survey_transect_number <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ image_id <dbl> 1, 1, 1, 1, 1, 13, 13, 13, 13, 13, 24, 24…
$ image_name <chr> "Reef1 Site 1 Transect 1 2016-01-01 1", "…
$ image_quality <dbl> 100, 100, 100, 100, 100, 100, 100, 100, 1…
$ point_id <dbl> 1, 2, 3, 4, 5, 61, 62, 63, 64, 65, 116, 1…
$ point_no <dbl> 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4,…
$ point_machine_classification <chr> "SC", "HCC", "MA", "HCC", "HCC", "HCC", "…
# A tibble: 6 × 24
project_id project_name site_id site_name site_latitude site_longitude
<dbl> <chr> <dbl> <chr> <dbl> <dbl>
1 1 synthetic_fixed 1 Reef1 Site 1 -19.9 9.82
2 1 synthetic_fixed 1 Reef1 Site 1 -19.9 9.82
3 1 synthetic_fixed 1 Reef1 Site 1 -19.9 9.82
4 1 synthetic_fixed 1 Reef1 Site 1 -19.9 9.82
5 1 synthetic_fixed 1 Reef1 Site 1 -19.9 9.82
6 1 synthetic_fixed 1 Reef1 Site 1 -19.9 9.82
# ℹ 18 more variables: site_depth <dbl>, site_country <chr>,
# site_reef_name <chr>, site_reef_type <lgl>, site_reef_zone <lgl>,
# site_code <lgl>, site_management <lgl>, survey_id <dbl>,
# survey_title <chr>, survey_start_date <dttm>, survey_depth <dbl>,
# survey_transect_number <dbl>, image_id <dbl>, image_name <chr>,
# image_quality <dbl>, point_id <dbl>, point_no <dbl>,
# point_machine_classification <chr>
spc_tbl_ [2,999,943 × 24] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
$ project_id : num [1:2999943] 1 1 1 1 1 1 1 1 1 1 ...
$ project_name : chr [1:2999943] "synthetic_fixed" "synthetic_fixed" "synthetic_fixed" "synthetic_fixed" ...
$ site_id : num [1:2999943] 1 1 1 1 1 1 1 1 1 1 ...
$ site_name : chr [1:2999943] "Reef1 Site 1" "Reef1 Site 1" "Reef1 Site 1" "Reef1 Site 1" ...
$ site_latitude : num [1:2999943] -19.9 -19.9 -19.9 -19.9 -19.9 ...
$ site_longitude : num [1:2999943] 9.82 9.82 9.82 9.82 9.82 ...
$ site_depth : num [1:2999943] 3 3 3 3 3 3 3 3 3 3 ...
$ site_country : chr [1:2999943] "synthetic Country" "synthetic Country" "synthetic Country" "synthetic Country" ...
$ site_reef_name : chr [1:2999943] "Reef1" "Reef1" "Reef1" "Reef1" ...
$ site_reef_type : logi [1:2999943] NA NA NA NA NA NA ...
$ site_reef_zone : logi [1:2999943] NA NA NA NA NA NA ...
$ site_code : logi [1:2999943] NA NA NA NA NA NA ...
$ site_management : logi [1:2999943] NA NA NA NA NA NA ...
$ survey_id : num [1:2999943] 1 1 1 1 1 1 1 1 1 1 ...
$ survey_title : chr [1:2999943] "Reef1 Site 1 Transect 1 2016-01-01" "Reef1 Site 1 Transect 1 2016-01-01" "Reef1 Site 1 Transect 1 2016-01-01" "Reef1 Site 1 Transect 1 2016-01-01" ...
$ survey_start_date : POSIXct[1:2999943], format: "2016-01-01 04:00:00" "2016-01-01 04:00:00" ...
$ survey_depth : num [1:2999943] 3 3 3 3 3 3 3 3 3 3 ...
$ survey_transect_number : num [1:2999943] 1 1 1 1 1 1 1 1 1 1 ...
$ image_id : num [1:2999943] 1 1 1 1 1 13 13 13 13 13 ...
$ image_name : chr [1:2999943] "Reef1 Site 1 Transect 1 2016-01-01 1" "Reef1 Site 1 Transect 1 2016-01-01 1" "Reef1 Site 1 Transect 1 2016-01-01 1" "Reef1 Site 1 Transect 1 2016-01-01 1" ...
$ image_quality : num [1:2999943] 100 100 100 100 100 100 100 100 100 100 ...
$ point_id : num [1:2999943] 1 2 3 4 5 61 62 63 64 65 ...
$ point_no : num [1:2999943] 1 2 3 4 5 1 2 3 4 5 ...
$ point_machine_classification: chr [1:2999943] "SC" "HCC" "MA" "HCC" ...
- attr(*, "spec")=
.. cols(
.. project_id = col_double(),
.. project_name = col_character(),
.. site_id = col_double(),
.. site_name = col_character(),
.. site_latitude = col_double(),
.. site_longitude = col_double(),
.. site_depth = col_double(),
.. site_country = col_character(),
.. site_reef_name = col_character(),
.. site_reef_type = col_logical(),
.. site_reef_zone = col_logical(),
.. site_code = col_logical(),
.. site_management = col_logical(),
.. survey_id = col_double(),
.. survey_title = col_character(),
.. survey_start_date = col_datetime(format = ""),
.. survey_depth = col_double(),
.. survey_transect_number = col_double(),
.. image_id = col_double(),
.. image_name = col_character(),
.. image_quality = col_double(),
.. point_id = col_double(),
.. point_no = col_double(),
.. point_machine_classification = col_character()
.. )
- attr(*, "problems")=<externalptr>
Warning: Following 4 columns were empty and have been removed:
site_reef_type, site_reef_zone, site_code and site_management
ID | Name | Type | Missings | Values | N | Prop | .row_id |
---|---|---|---|---|---|---|---|
1 | project_id | numeric | 0 (0.0%) | 1 | 2999943 | 100.0% | 1 |
1 | |||||||
2 | project_name | character | 0 (0.0%) | synthetic_fixed | 2999943 | 100.0% | 2 |
2 | |||||||
3 | site_id | numeric | 0 (0.0%) | [1, 350] | 2999943 | 3 | |
3 | |||||||
4 | site_name | character | 0 (0.0%) | Reef1 Site 1 | 4998 | 0.2% | 4 |
Reef1 Site 2 | 5000 | 0.2% | 4 | ||||
Reef10 Site 1 | 10000 | 0.3% | 4 | ||||
Reef10 Site 2 | 10001 | 0.3% | 4 | ||||
Reef101 Site 1 | 5000 | 0.2% | 4 | ||||
Reef101 Site 2 | 4999 | 0.2% | 4 | ||||
Reef102 Site 1 | 4999 | 0.2% | 4 | ||||
Reef102 Site 2 | 5002 | 0.2% | 4 | ||||
Reef103 Site 1 | 4999 | 0.2% | 4 | ||||
Reef103 Site 2 | 5000 | 0.2% | 4 | ||||
(…) | NA | NA | 4 | ||||
4 | |||||||
5 | site_latitude | numeric | 0 (0.0%) | [-20.99, -10.25] | 2999943 | 5 | |
5 | |||||||
6 | site_longitude | numeric | 0 (0.0%) | [0.34, 9.82] | 2999943 | 6 | |
6 | |||||||
7 | site_depth | numeric | 0 (0.0%) | 3 | 1499995 | 50.0% | 7 |
10 | 1499948 | 50.0% | 7 | ||||
7 | |||||||
8 | site_country | character | 0 (0.0%) | synthetic Country | 2999943 | 100.0% | 8 |
8 | |||||||
9 | site_reef_name | character | 0 (0.0%) | Reef1 | 9998 | 0.3% | 9 |
Reef10 | 20001 | 0.7% | 9 | ||||
Reef101 | 9999 | 0.3% | 9 | ||||
Reef102 | 10001 | 0.3% | 9 | ||||
Reef103 | 9999 | 0.3% | 9 | ||||
Reef104 | 10001 | 0.3% | 9 | ||||
Reef105 | 10002 | 0.3% | 9 | ||||
Reef106 | 20002 | 0.7% | 9 | ||||
Reef107 | 30001 | 1.0% | 9 | ||||
Reef108 | 9999 | 0.3% | 9 | ||||
(…) | NA | NA | 9 | ||||
9 | |||||||
14 | survey_id | numeric | 0 (0.0%) | [1, 3000] | 2999943 | 10 | |
10 | |||||||
15 | survey_title | character | 0 (0.0%) | Reef1 Site 1 Transect 1 2016-01-01 | 1000 | 0.0% | 11 |
Reef1 Site 1 Transect 2 2016-01-01 | 999 | 0.0% | 11 | ||||
Reef1 Site 1 Transect 3 2016-01-01 | 998 | 0.0% | 11 | ||||
Reef1 Site 1 Transect 4 2016-01-01 | 1001 | 0.0% | 11 | ||||
Reef1 Site 1 Transect 5 2016-01-01 | 1000 | 0.0% | 11 | ||||
Reef1 Site 2 Transect 1 2016-01-01 | 1000 | 0.0% | 11 | ||||
Reef1 Site 2 Transect 2 2016-01-01 | 1000 | 0.0% | 11 | ||||
Reef1 Site 2 Transect 3 2016-01-01 | 1000 | 0.0% | 11 | ||||
Reef1 Site 2 Transect 4 2016-01-01 | 1000 | 0.0% | 11 | ||||
Reef1 Site 2 Transect 5 2016-01-01 | 1000 | 0.0% | 11 | ||||
(…) | NA | NA | 11 | ||||
11 | |||||||
16 | survey_start_date | numeric | 0 (0.0%) | 2010-01-01 04:00:00 | 250001 | 8.3% | 12 |
2011-01-01 04:00:00 | 249987 | 8.3% | 12 | ||||
2012-01-01 04:00:00 | 249996 | 8.3% | 12 | ||||
2013-01-01 04:00:00 | 249986 | 8.3% | 12 | ||||
2014-01-01 04:00:00 | 249997 | 8.3% | 12 | ||||
2015-01-01 04:00:00 | 250021 | 8.3% | 12 | ||||
2016-01-01 04:00:00 | 249986 | 8.3% | 12 | ||||
2017-01-01 04:00:00 | 250007 | 8.3% | 12 | ||||
2018-01-01 04:00:00 | 249995 | 8.3% | 12 | ||||
2019-01-01 04:00:00 | 250007 | 8.3% | 12 | ||||
(…) | NA | NA | 12 | ||||
12 | |||||||
17 | survey_depth | numeric | 0 (0.0%) | 3 | 1499995 | 50.0% | 13 |
10 | 1499948 | 50.0% | 13 | ||||
13 | |||||||
18 | survey_transect_number | numeric | 0 (0.0%) | 1 | 599986 | 20.0% | 14 |
2 | 599992 | 20.0% | 14 | ||||
3 | 599980 | 20.0% | 14 | ||||
4 | 599992 | 20.0% | 14 | ||||
5 | 599993 | 20.0% | 14 | ||||
14 | |||||||
19 | image_id | numeric | 0 (0.0%) | [1, 300000] | 2999943 | 15 | |
15 | |||||||
20 | image_name | character | 0 (0.0%) | Reef1 Site 1 Transect 1 2016-01-01 1 | 10 | 0.0% | 16 |
Reef1 Site 1 Transect 1 2016-01-01 10 | 10 | 0.0% | 16 | ||||
Reef1 Site 1 Transect 1 2016-01-01 100 | 10 | 0.0% | 16 | ||||
Reef1 Site 1 Transect 1 2016-01-01 11 | 10 | 0.0% | 16 | ||||
Reef1 Site 1 Transect 1 2016-01-01 12 | 10 | 0.0% | 16 | ||||
Reef1 Site 1 Transect 1 2016-01-01 13 | 10 | 0.0% | 16 | ||||
Reef1 Site 1 Transect 1 2016-01-01 14 | 10 | 0.0% | 16 | ||||
Reef1 Site 1 Transect 1 2016-01-01 15 | 10 | 0.0% | 16 | ||||
Reef1 Site 1 Transect 1 2016-01-01 16 | 10 | 0.0% | 16 | ||||
Reef1 Site 1 Transect 1 2016-01-01 17 | 10 | 0.0% | 16 | ||||
(…) | NA | NA | 16 | ||||
16 | |||||||
21 | image_quality | numeric | 0 (0.0%) | 100 | 2999943 | 100.0% | 17 |
17 | |||||||
22 | point_id | numeric | 0 (0.0%) | [1, 1.49994e+06] | 2999943 | 18 | |
18 | |||||||
23 | point_no | numeric | 0 (0.0%) | 1 | 600714 | 20.0% | 19 |
2 | 600000 | 20.0% | 19 | ||||
3 | 600000 | 20.0% | 19 | ||||
4 | 600000 | 20.0% | 19 | ||||
5 | 599229 | 20.0% | 19 | ||||
19 | |||||||
24 | point_machine_classification | character | 0 (0.0%) | HCC | 1046658 | 34.9% | 20 |
MA | 1776443 | 59.2% | 20 | ||||
SC | 176842 | 5.9% | 20 | ||||
20 |
4 Excluding extraneous fields
As these are synthetic data, not all the typical reefCloud fields are present. Nevertheless, there are still a large number of fields (columns) in this dataset, many of which we are going to ignore for this exercise. The important fields are:
- `site_id` - a unique identifier of the site
- `site_name` - a unique name of the site
- `site_latitude` - latitude of the site
- `site_longitude` - longitude of the site
- `survey_start_date` - date (and time) of survey
- `survey_depth` - depth at which the survey took place
- `survey_transect_number` - unique identifier of the transect
- `image_id` - unique identifier of the image
- `image_quality` - indication of the quality of the image
- `point_id` - unique identifier of the point
- `point_no` - the number of the point within the image
- `point_machine_classification` - classification determined by AI
Although it is often harmless enough to retain the other fields, it does make reviewing the data more cumbersome, so at an early stage within this exercise, we will restrict the data to just the above fields.
## Restrict the fixed-design data to the fields used in the remaining
## wrangling steps, then show the first rows
data_fixed <- data_fixed |>
  dplyr::select(dplyr::all_of(c(
    "site_id", "site_name", "site_latitude", "site_longitude",
    "survey_start_date", "survey_depth", "survey_transect_number",
    "image_id", "image_quality",
    "point_id", "point_no", "point_machine_classification"
  )))
head(data_fixed)
# A tibble: 6 × 12
site_id site_name site_latitude site_longitude survey_start_date
<dbl> <chr> <dbl> <dbl> <dttm>
1 1 Reef118 Site 1 -20.3 3.96 2010-01-01 04:00:00
2 1 Reef118 Site 1 -20.3 3.96 2010-01-01 04:00:00
3 1 Reef118 Site 1 -20.3 3.96 2010-01-01 04:00:00
4 1 Reef118 Site 1 -20.3 3.96 2010-01-01 04:00:00
5 1 Reef118 Site 1 -20.3 3.96 2010-01-01 04:00:00
6 1 Reef118 Site 1 -20.3 3.96 2010-01-01 04:00:00
# ℹ 7 more variables: survey_depth <dbl>, survey_transect_number <dbl>,
# image_id <dbl>, image_quality <dbl>, point_id <dbl>, point_no <dbl>,
# point_machine_classification <chr>
## Restrict the random-design data to the fields listed as important above
## (site, survey, image and point identifiers plus the AI classification).
data_random <- dplyr::select(
  data_random,
  dplyr::all_of(c(
    "site_id", "site_name", "site_latitude", "site_longitude",
    "survey_start_date", "survey_depth", "survey_transect_number",
    "image_id", "image_quality",
    "point_id", "point_no", "point_machine_classification"
  ))
)
## Preview the first rows of the trimmed data
head(data_random)
# A tibble: 6 × 12
site_id site_name site_latitude site_longitude survey_start_date
<dbl> <chr> <dbl> <dbl> <dttm>
1 1 Reef1 Site 1 -19.9 9.82 2016-01-01 04:00:00
2 1 Reef1 Site 1 -19.9 9.82 2016-01-01 04:00:00
3 1 Reef1 Site 1 -19.9 9.82 2016-01-01 04:00:00
4 1 Reef1 Site 1 -19.9 9.82 2016-01-01 04:00:00
5 1 Reef1 Site 1 -19.9 9.82 2016-01-01 04:00:00
6 1 Reef1 Site 1 -19.9 9.82 2016-01-01 04:00:00
# ℹ 7 more variables: survey_depth <dbl>, survey_transect_number <dbl>,
# image_id <dbl>, image_quality <dbl>, point_id <dbl>, point_no <dbl>,
# point_machine_classification <chr>
5 Excluding poor images
6 Lengthen the data
To facilitate most graphical and statistical modelling routines, data must be structured such that each row represents an individual record and that the variables are in columns.
7 Joining to the code group lookup data
Within reefCloud, the taxonomic resolution of point classification depends on the granularity of training label sets. For many analyses (such as spatio-temporal modelling of hard coral cover), this is overly granular. In order to group the taxonomic levels up to the larger groups (such as hard coral, soft coral and macroalgae), it is necessary to join the data to a lookup table representing the labelsets. However, the current synthetic data were only constructed to the broad categories (hard coral, soft coral and macroalgae) in the first place.
Although the current data are already recorded in the desired taxonomic resolution, for code completeness, we will still join in the labelset data (which we will first generate here).
Rows: 3 Columns: 4
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (3): CODE, DESCRIPTION, FUNCTIONAL GROUP
lgl (1): KEYBOARD SHORTCUT CODE
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
spc_tbl_ [3 × 4] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
$ CODE : chr [1:3] "HCC" "SC" "MA"
$ DESCRIPTION : chr [1:3] "Hard coral" "Soft coral" "Macroalgae"
$ FUNCTIONAL GROUP : chr [1:3] "Hard coral" "Soft coral" "Macroalgae"
$ KEYBOARD SHORTCUT CODE: logi [1:3] NA NA NA
- attr(*, "spec")=
.. cols(
.. CODE = col_character(),
.. DESCRIPTION = col_character(),
.. `FUNCTIONAL GROUP` = col_character(),
.. `KEYBOARD SHORTCUT CODE` = col_logical()
.. )
- attr(*, "problems")=<externalptr>
Warning: Following 1 columns were empty and have been removed:
KEYBOARD SHORTCUT CODE
ID | Name | Type | Missings | Values | N | Prop | .row_id |
---|---|---|---|---|---|---|---|
1 | CODE | character | 0 (0.0%) | HCC | 1 | 33.3% | 1 |
MA | 1 | 33.3% | 1 | ||||
SC | 1 | 33.3% | 1 | ||||
1 | |||||||
2 | DESCRIPTION | character | 0 (0.0%) | Hard coral | 1 | 33.3% | 2 |
Macroalgae | 1 | 33.3% | 2 | ||||
Soft coral | 1 | 33.3% | 2 | ||||
2 | |||||||
3 | FUNCTIONAL GROUP | character | 0 (0.0%) | Hard coral | 1 | 33.3% | 3 |
Macroalgae | 1 | 33.3% | 3 | ||||
Soft coral | 1 | 33.3% | 3 | ||||
3 |
## Attach the functional group (GROUP) to every classified point by matching
## the point's classification code against the labelset CODE.
## `relationship = "many-to-one"` makes the join fail loudly if the labelset
## ever contains a duplicated CODE; without it such duplicates would silently
## duplicate points and inflate the point tallies computed afterwards.
data_fixed <-
  data_fixed |>
  left_join(
    labelset |>
      dplyr::select(CODE, GROUP = `FUNCTIONAL GROUP`),
    by = c("classification" = "CODE"),
    relationship = "many-to-one"
  )
## Preview as a plain data.frame so that all columns are printed
data_fixed |> as.data.frame() |> head()
site_id site_name site_latitude site_longitude survey_start_date
1 1 Reef118 Site 1 -20.27968 3.959314 2010-01-01 04:00:00
2 1 Reef118 Site 1 -20.27968 3.959314 2010-01-01 04:00:00
3 1 Reef118 Site 1 -20.27968 3.959314 2010-01-01 04:00:00
4 1 Reef118 Site 1 -20.27968 3.959314 2010-01-01 04:00:00
5 1 Reef118 Site 1 -20.27968 3.959314 2010-01-01 04:00:00
6 1 Reef118 Site 1 -20.27968 3.959314 2010-01-01 04:00:00
survey_depth survey_transect_number image_id image_quality point_id point_no
1 3 1 1 100 1 1
2 3 1 1 100 2 2
3 3 1 1 100 3 3
4 3 1 1 100 4 4
5 3 1 1 100 5 5
6 3 1 13 100 61 1
type classification GROUP
1 point_machine_classification MA Macroalgae
2 point_machine_classification MA Macroalgae
3 point_machine_classification HCC Hard coral
4 point_machine_classification MA Macroalgae
5 point_machine_classification MA Macroalgae
6 point_machine_classification MA Macroalgae
## Attach the functional group (GROUP) to every classified point by matching
## the point's classification code against the labelset CODE.
## `relationship = "many-to-one"` makes the join fail loudly if the labelset
## ever contains a duplicated CODE; without it such duplicates would silently
## duplicate points and inflate the point tallies computed afterwards.
data_random <-
  data_random |>
  left_join(
    labelset |>
      dplyr::select(CODE, GROUP = `FUNCTIONAL GROUP`),
    by = c("classification" = "CODE"),
    relationship = "many-to-one"
  )
## Preview as a plain data.frame so that all columns are printed
data_random |> as.data.frame() |> head()
site_id site_name site_latitude site_longitude survey_start_date
1 1 Reef1 Site 1 -19.90968 9.819314 2016-01-01 04:00:00
2 1 Reef1 Site 1 -19.90968 9.819314 2016-01-01 04:00:00
3 1 Reef1 Site 1 -19.90968 9.819314 2016-01-01 04:00:00
4 1 Reef1 Site 1 -19.90968 9.819314 2016-01-01 04:00:00
5 1 Reef1 Site 1 -19.90968 9.819314 2016-01-01 04:00:00
6 1 Reef1 Site 1 -19.90968 9.819314 2016-01-01 04:00:00
survey_depth survey_transect_number image_id image_quality point_id point_no
1 3 1 1 100 1 1
2 3 1 1 100 2 2
3 3 1 1 100 3 3
4 3 1 1 100 4 4
5 3 1 1 100 5 5
6 3 1 13 100 61 1
type classification GROUP
1 point_machine_classification SC Soft coral
2 point_machine_classification HCC Hard coral
3 point_machine_classification MA Macroalgae
4 point_machine_classification HCC Hard coral
5 point_machine_classification HCC Hard coral
6 point_machine_classification HCC Hard coral
8 Tally up points
Count the number of points of each type as well as sum up the total number of points per image.
9 Recode transects
10 Fill in any gaps
Since the data represent the classification of points in images, they record only what was present, not what was absent. For example, if all five points in an image are Macroalgae, then every other functional group is absent from that image - yet this information is missing from the data. For modelling purposes it is vital that we fill in all of these zero values.
To do so, we must create a data set that contains every GROUP in every IMAGE.
## Every functional group that occurs anywhere in the fixed-design data
GROUPS <- unique(pull(data_fixed, GROUP))
## Distinct combinations of the site, survey, transect, image, type and TOTAL
## fields, crossed with every functional group
data.filler <- tidyr::crossing(
  distinct(
    dplyr::select(
      data_fixed,
      starts_with("site"),
      survey_start_date,
      survey_depth,
      transect_id,
      image_id,
      type,
      TOTAL
    )
  ),
  GROUP = GROUPS
)
## Add the image/GROUP combinations that are absent from the data; their
## COUNT arrives as NA from the join and is set to 0.
## NOTE: the result is deliberately left grouped.
data_fixed <- full_join(data_fixed, data.filler) |>
  group_by(
    across(c(
      starts_with("site"),
      survey_start_date,
      survey_depth,
      transect_id,
      image_id,
      type,
      GROUP
    ))
  ) |>
  mutate(
    COUNT = ifelse(is.na(COUNT), 0, COUNT),
    TOTAL = max(TOTAL, na.rm = TRUE)
  )
Joining with `by = join_by(site_id, site_name, site_latitude, site_longitude,
survey_start_date, survey_depth, type, image_id, GROUP, TOTAL, transect_id)`
## Every functional group that occurs anywhere in the random-design data
GROUPS <- unique(pull(data_random, GROUP))
## Distinct combinations of the site, survey, transect, image, type and TOTAL
## fields, crossed with every functional group
data.filler <- tidyr::crossing(
  distinct(
    dplyr::select(
      data_random,
      starts_with("site"),
      survey_start_date,
      survey_depth,
      transect_id,
      image_id,
      type,
      TOTAL
    )
  ),
  GROUP = GROUPS
)
## Add the image/GROUP combinations that are absent from the data; their
## COUNT arrives as NA from the join and is set to 0.
## NOTE: the result is deliberately left grouped.
data_random <- full_join(data_random, data.filler) |>
  group_by(
    across(c(
      starts_with("site"),
      survey_start_date,
      survey_depth,
      transect_id,
      image_id,
      type,
      GROUP
    ))
  ) |>
  mutate(
    COUNT = ifelse(is.na(COUNT), 0, COUNT),
    TOTAL = max(TOTAL, na.rm = TRUE)
  )
Joining with `by = join_by(site_id, site_name, site_latitude, site_longitude,
survey_start_date, survey_depth, type, image_id, GROUP, TOTAL, transect_id)`
11 Sum to transect level
## Collapse images within each transect: remove image_id from the existing
## grouping, then add up COUNT and TOTAL over the remaining groups.
## Finally drop all grouping and any unused factor levels.
data_fixed <- data_fixed |>
  ungroup(image_id) |>
  summarise(across(c(COUNT, TOTAL), sum)) |>
  ungroup() |>
  droplevels()
`summarise()` has grouped output by 'site_id', 'site_name', 'site_latitude',
'site_longitude', 'survey_start_date', 'survey_depth', 'transect_id', 'type'.
You can override using the `.groups` argument.
## Collapse images within each transect: remove image_id from the existing
## grouping, then add up COUNT and TOTAL over the remaining groups.
## Finally drop all grouping and any unused factor levels.
data_random <- data_random |>
  ungroup(image_id) |>
  summarise(across(c(COUNT, TOTAL), sum)) |>
  ungroup() |>
  droplevels()
`summarise()` has grouped output by 'site_id', 'site_name', 'site_latitude',
'site_longitude', 'survey_start_date', 'survey_depth', 'transect_id', 'type'.
You can override using the `.groups` argument.
12 Generate a year field
13 Generate a reef id
14 Visualisations
The end