From 1cb4f1e217442b7b88dba654087d1d949c679e1d Mon Sep 17 00:00:00 2001 From: David Frantz Date: Fri, 26 Jan 2024 14:54:08 +0100 Subject: [PATCH 01/23] 1st draft sample size computation --- rstats/force-sample-size.r | 98 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 98 insertions(+) create mode 100644 rstats/force-sample-size.r diff --git a/rstats/force-sample-size.r b/rstats/force-sample-size.r new file mode 100644 index 00000000..a368b3e0 --- /dev/null +++ b/rstats/force-sample-size.r @@ -0,0 +1,98 @@ +#!/usr/bin/env Rscript + +# load libraries #################################################### +library(dplyr) + + +# input ############################################################# +args <- commandArgs(trailingOnly = TRUE) +n_args <- 3 + +if (length(args) != n_args) { + c( + "\nWrong input!\n", + " 1: path_hist\n", + " 2: file_output\n" + ) %>% + stop() +} + + +library(dplyr) + +standard_error <- 0.01 +minsize <- 175 + +pixel_counts <- data.frame( + value = 0:2, + count = c( + 1868242749, + 496472240, + 502851998 + ) +) + +users_accuracy <- data.frame( + value = 0:2, + UA = c( + 0.9, + 0.9, + 0.9 + ) +) + +df <- users_accuracy %>% + inner_join( + pixel_counts, + by = "value" + ) + +if (nrow(df) != nrow(pixel_counts)){ + "input tables do not match\n" + stop() +} + +df <- df %>% + mutate(proportion = count / sum(count)) %>% + mutate(stdev = sqrt(UA*(1-UA))) %>% + mutate(WS = proportion * stdev) + +number <- (sum(df$WS)/standard_error)**2 +number + +df <- df %>% + mutate(equal = round(number / nrow(df))) %>% + mutate(proportional = round(number * proportion)) %>% + mutate(compromise = NA) + +if (min(df$proportional) < minsize){ + + rare <- df %>% + filter(proportional < minsize) %>% + mutate(compromise = minsize) + + n_rare <- sum(rare$compromise) + + big <- df %>% + filter(proportional >= minsize) %>% + mutate(compromise = (number-n_rare) * proportion) + + if (any(big$compromise < minsize)){ + c( + "compromising the sample allocation failed...\n", + "either decrease minsize or adjust input parameters to \n", + "have more samples overall." + ) + } + + df <- rbind(rare, big) + +} + +df %>% + mutate(deviation = (compromise - proportional) / proportional * 100) + +write.csv(df, file_output, row.names = FALSE, quote = FALSE) + + + From 93e99bf9733ffe1a2f1e1ee1289d1803fefe143a Mon Sep 17 00:00:00 2001 From: David Frantz Date: Sat, 27 Jan 2024 12:42:44 +0100 Subject: [PATCH 02/23] added license --- src/aux-level/_hist.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/aux-level/_hist.c b/src/aux-level/_hist.c index 802fc967..c4e4d992 100644 --- a/src/aux-level/_hist.c +++ b/src/aux-level/_hist.c @@ -3,7 +3,7 @@ This file is part of FORCE - Framework for Operational Radiometric Correction for Environmental monitoring. 
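For reference, the sample size computed by the draft script in the patch above follows the standard stratified-design formula used in the map-accuracy literature (cf. Olofsson et al. 2014). In the script's own notation, with W_i the mapped proportion of class i (proportion), U_i the expected user's accuracy (UA) and S(O) the target standard error of overall accuracy (standard_error), the computation amounts to, in LaTeX notation:

    S_i = \sqrt{U_i \, (1 - U_i)}, \qquad
    n   = \left( \frac{\sum_i W_i S_i}{S(\hat{O})} \right)^2

which is exactly what number <- (sum(df$WS)/standard_error)**2 evaluates, given stdev = sqrt(UA*(1-UA)) and WS = proportion * stdev.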
-Copyright (C) 2013-2022 David Frantz +Copyright (C) 2013-2024 David Frantz FORCE is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by From b198df75096398c64f4262641ed7153faeed3878 Mon Sep 17 00:00:00 2001 From: David Frantz Date: Sat, 27 Jan 2024 12:42:59 +0100 Subject: [PATCH 03/23] getopt --- rstats/force-sample-size.r | 132 ++++++++++++++++++++++++++++++++++--- 1 file changed, 122 insertions(+), 10 deletions(-) diff --git a/rstats/force-sample-size.r b/rstats/force-sample-size.r index a368b3e0..60a795a7 100644 --- a/rstats/force-sample-size.r +++ b/rstats/force-sample-size.r @@ -1,24 +1,136 @@ #!/usr/bin/env Rscript +# This file is part of FORCE - Framework for Operational Radiometric +# Correction for Environmental monitoring. +# +# Copyright (C) 2013-2024 David Frantz +# +# FORCE is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# FORCE is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with FORCE. If not, see . + + # load libraries #################################################### library(dplyr) +library(getopt) # input ############################################################# -args <- commandArgs(trailingOnly = TRUE) -n_args <- 3 -if (length(args) != n_args) { - c( - "\nWrong input!\n", - " 1: path_hist\n", - " 2: file_output\n" - ) %>% - stop() +# usage function in same style as other FORCE tools, +# do not use getop's built-in for consistency +usage <- function(exit){ + + message <- c( + sprintf( + "Usage: %s [-h] [-v] [-i] [-e error] [-s min_size] [-o output-file] -c count-file -u user_acc-file\n", + get_Rscript_filename() + ), + "\n", + " -h = show this help\n", + " -v = show version\n", + " -i = show program's purpose\n", + "\n", + " -e error = standard error, defaults to 0.01\n", + "\n", + " -s min_size = minimum sample size per class\n", + "\n", + " -o output-file = output file path with extension,\n", + " defaults to './sample-size.csv'\n", + "\n", + " -c count-file = csv table with pixel counts per class\n", + "\n", + " -u user_acc-file = csv table with expected user accuracy per class\n", + "\n", + + ) + + cat( + message, + file = if (exit == 0) stdout else stderr() + ) + + quit( + save = "no", + status = exit + ) + } +version <- function() { + cat("Printing function not implemented yet. 
Sorry.\n") + quit( + save = "no", + status = exit + ) +} + +info <- function() { + cat("Suggest sample size for validation of a classification.\n") + quit( + save = "no", + status = exit + ) +} + +mandatory_missing <- function(argument) { + cat( + sprintf("%s is missing!\n", argument), + file = stderr() + ) + usage(1) +} + +file_existing <- function(path) { + if (!file.exist(path)){ + cat( + sprintf("file %s does not exist\n", path), + file = stderr() + ) + usage(1) + } +} + +spec <- matrix( + c( + "help", "h", 0, "logical", + "version", "v", 0, "logical", + "info", "i", 0, "logical", + "error", "e", 2, "numeric", + "min_size", "s", 2, "integer", + "counts", "c", 1, "character", + "user_acc", "u", 1, "character", + "output", "o", 1, "character" + ), + byrow = TRUE, + ncol = 4 +) + +opt <- getopt(spec) + +if (!is.null(opt$help)) usage() +if (!is.null(opt$info)) info() +if (!is.null(opt$version)) version() + +if (is.null(opt$counts)) mandatory_missing("count-file") +if (is.null(opt$user_acc)) mandatory_missing("user_acc-file") + +file_existing(opt$counts) +file_existing(opt$user_acc) + +if (is.null(opt$error)) opt$error <- 0.01 +if (is.null(opt$min_size)) opt$min_size <- 50 +if (is.null(opt$output)) opt$output <- "sample-size.csv" -library(dplyr) standard_error <- 0.01 minsize <- 175 From 0c90f65832e731f23888de802974fa6d56b6aba5 Mon Sep 17 00:00:00 2001 From: David Frantz Date: Sat, 27 Jan 2024 16:26:59 +0100 Subject: [PATCH 04/23] tidied up --- rstats/force-sample-size.r | 123 ++++++++++++++++++------------------- 1 file changed, 60 insertions(+), 63 deletions(-) diff --git a/rstats/force-sample-size.r b/rstats/force-sample-size.r index 60a795a7..3be99b91 100644 --- a/rstats/force-sample-size.r +++ b/rstats/force-sample-size.r @@ -27,7 +27,7 @@ library(getopt) # input ############################################################# # usage function in same style as other FORCE tools, -# do not use getop's built-in for consistency +# do not use getop's built-in usage function for consistency usage <- function(exit){ message <- c( @@ -48,8 +48,10 @@ usage <- function(exit){ " defaults to './sample-size.csv'\n", "\n", " -c count-file = csv table with pixel counts per class\n", + " 2 columns named value and count", "\n", " -u user_acc-file = csv table with expected user accuracy per class\n", + " 2 columns named value and UA", "\n", ) @@ -66,25 +68,19 @@ usage <- function(exit){ } -version <- function() { - cat("Printing function not implemented yet. Sorry.\n") - quit( - save = "no", - status = exit +exit_normal <- function(argument) { + cat( + sprintf("%s\n", argument) ) -} - -info <- function() { - cat("Suggest sample size for validation of a classification.\n") quit( save = "no", - status = exit + status = 0 ) } -mandatory_missing <- function(argument) { +exit_with_error <- function(argument) { cat( - sprintf("%s is missing!\n", argument), + sprintf("%s\n", argument), file = stderr() ) usage(1) @@ -118,11 +114,11 @@ spec <- matrix( opt <- getopt(spec) if (!is.null(opt$help)) usage() -if (!is.null(opt$info)) info() -if (!is.null(opt$version)) version() +if (!is.null(opt$info)) exit_normal("Suggest sample size for validation of a classification.\n") +if (!is.null(opt$version)) exit_normal("Printing function not implemented yet. 
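With the getopt interface introduced in this patch, a typical call of the script could look like the following line (the two csv file names are placeholders; -e, -s and -o fall back to their defaults of 0.01, 50 and ./sample-size.csv when omitted):

    Rscript force-sample-size.r -c class-counts.csv -u expected-ua.csv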
Sorry.\n") -if (is.null(opt$counts)) mandatory_missing("count-file") -if (is.null(opt$user_acc)) mandatory_missing("user_acc-file") +if (is.null(opt$counts)) exit_with_error("count-file is missing!") +if (is.null(opt$user_acc)) exit_with_error("user_acc-file is missing!") file_existing(opt$counts) file_existing(opt$user_acc) @@ -132,79 +128,80 @@ if (is.null(opt$min_size)) opt$min_size <- 50 if (is.null(opt$output)) opt$output <- "sample-size.csv" -standard_error <- 0.01 -minsize <- 175 +pixel_counts <- read.csv(opt$counts) +users_accuracy <- read.csv(opt$user_acc) -pixel_counts <- data.frame( - value = 0:2, - count = c( - 1868242749, - 496472240, - 502851998 - ) -) -users_accuracy <- data.frame( - value = 0:2, - UA = c( - 0.9, - 0.9, - 0.9 - ) -) +# main thing ######################################################## -df <- users_accuracy %>% +# join input tables +table <- users_accuracy %>% inner_join( pixel_counts, by = "value" ) -if (nrow(df) != nrow(pixel_counts)){ - "input tables do not match\n" - stop() +# join worked? +if (nrow(table) != nrow(pixel_counts)){ + exit_with_error("count and user_acc could not be joined") } -df <- df %>% - mutate(proportion = count / sum(count)) %>% +# compute proportional area, and standard deviation of UA +table <- table %>% + mutate(area = count / sum(count)) %>% mutate(stdev = sqrt(UA*(1-UA))) %>% - mutate(WS = proportion * stdev) + mutate(areaXstdev = area * stdev) -number <- (sum(df$WS)/standard_error)**2 -number +# number of recommended samples +number <- (sum(table$areaXstdev)/standard_error)**2 -df <- df %>% - mutate(equal = round(number / nrow(df))) %>% - mutate(proportional = round(number * proportion)) %>% +sprintf("%d samples are suggested.\n", number) %>% + cat() + +# compute class-wise sample size for equal and proportional allocation +table <- table %>% + mutate(equal = round(number / nrow(table))) %>% + mutate(proportional = round(number * area)) %>% mutate(compromise = NA) -if (min(df$proportional) < minsize){ - - rare <- df %>% + +# do we have enough samples in proportional allocation? +if (min(table$proportional) < minsize){ + + cat("Proportional allocation yields too few samples.\n") + cat("A compromise between equal and proportional allocation is recommended.\n") + + # first, assign minimum sample size to small classes + rare <- table %>% filter(proportional < minsize) %>% mutate(compromise = minsize) n_rare <- sum(rare$compromise) - big <- df %>% + # allocate remaining samples to big classes proportionally + big <- table %>% filter(proportional >= minsize) %>% - mutate(compromise = (number-n_rare) * proportion) + mutate(compromise = (number-n_rare) * area) + # check if proportionally allocated classes are big enough if (any(big$compromise < minsize)){ - c( - "compromising the sample allocation failed...\n", - "either decrease minsize or adjust input parameters to \n", - "have more samples overall." - ) + exit_with_error("Compromising failed. 
Adjust input parameters.") } - df <- rbind(rare, big) + table <- rbind(rare, big) +} else { + cat("Proportional allocation recommended.\n") } -df %>% +# compute deviation of compromised allocation from proportional in percent +table <- table %>% mutate(deviation = (compromise - proportional) / proportional * 100) -write.csv(df, file_output, row.names = FALSE, quote = FALSE) - - - +# write output +write.csv( + table, + opt$output, + row.names = FALSE, + quote = FALSE +) From 9ff118457c81dfff3f33ec07b44853080f9b1b8c Mon Sep 17 00:00:00 2001 From: David Frantz Date: Sun, 28 Jan 2024 13:30:23 +0100 Subject: [PATCH 05/23] worked on map accuracy script --- rstats/force-map-accuracy.r | 262 ++++++++++++++++++++++++++++++++++++ 1 file changed, 262 insertions(+) create mode 100644 rstats/force-map-accuracy.r diff --git a/rstats/force-map-accuracy.r b/rstats/force-map-accuracy.r new file mode 100644 index 00000000..6ee1dd16 --- /dev/null +++ b/rstats/force-map-accuracy.r @@ -0,0 +1,262 @@ +#!/usr/bin/env Rscript + +# This file is part of FORCE - Framework for Operational Radiometric +# Correction for Environmental monitoring. +# +# Copyright (C) 2013-2024 David Frantz +# +# FORCE is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# FORCE is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with FORCE. If not, see . + +info <- "Compute map accuracy and area statistics.\n" + +# load libraries #################################################### +library(dplyr) +library(getopt) + + +# input ############################################################# + +# usage function in same style as other FORCE tools, +# do not use getop's built-in usage function for consistency +usage <- function(exit){ + + message <- c( + sprintf( + "Usage: %s [-h] [-v] [-i] [-o output-file] -c count-file -m map-file -r reference-file\n", + get_Rscript_filename() + ), + "\n", + " -h = show this help\n", + " -v = show version\n", + " -i = show program's purpose\n", + "\n", + " -o output-file = output file path with extension,\n", + " defaults to './accuracy-assessment.txt'\n", + "\n", + " -c count-file = csv table with pixel counts per class\n", + " 2 columns named class and count", + "\n", + " -m map-file = csv table with predicted class labels\n", + " 2 columns named ID and map", + "\n", + " -r reference-file = csv table with reference class labels\n", + " 2 columns named ID and reference", + "\n", + + ) + + cat( + message, + file = if (exit == 0) stdout else stderr() + ) + + quit( + save = "no", + status = exit + ) + +} + +exit_normal <- function(argument) { + cat( + sprintf("%s\n", argument) + ) + quit( + save = "no", + status = 0 + ) +} + +exit_with_error <- function(argument) { + cat( + sprintf("%s\n", argument), + file = stderr() + ) + usage(1) +} + +file_existing <- function(path) { + if (!file.exist(path)){ + cat( + sprintf("file %s does not exist\n", path), + file = stderr() + ) + usage(1) + } +} + +spec <- matrix( + c( + "help", "h", 0, "logical", + "version", "v", 0, "logical", + "info", "i", 0, "logical", + "output", "o", 2, "character", + "counts", "c", 1, "character", + "map", 
"m", 1, "character", + "reference", "r", 1, "character" + ), + byrow = TRUE, + ncol = 4 +) + +opt <- getopt(spec) + +if (!is.null(opt$help)) usage() +if (!is.null(opt$info)) exit_normal(info) +if (!is.null(opt$version)) exit_normal("Printing function not implemented yet. Sorry.\n") + +if (is.null(opt$counts)) exit_with_error("count-file is missing!") +if (is.null(opt$map)) exit_with_error("map-file is missing!") +if (is.null(opt$reference)) exit_with_error("reference-file is missing!") + +file_existing(opt$counts) +file_existing(opt$map) +file_existing(opt$reference) + +if (is.null(opt$output)) opt$output <- "accuracy-assessment.txt" + + +cnt <- read.csv(opt$counts) +map <- read.csv(opt$map) +ref <- read.csv(opt$reference) + +# columns OK? +if (!"ID" %in% colnames(map)) exit_with_error("ID column in map-file missing") +if (!"ID" %in% colnames(ref)) exit_with_error("ID column in reference-file missing") +if (!"map" %in% colnames(map)) exit_with_error("map column in map-file missing") +if (!"reference" %in% colnames(ref)) exit_with_error("reference column in reference-file missing") +if (!"class" %in% colnames(cnt)) exit_with_error("class column in count-file missing") +if (!"count" %in% colnames(cnt)) exit_with_error("count column in count-file missing") + + +# main thing ######################################################## + +# join input tables +table <- reference %>% + inner_join( + map, + by = "ID" + ) + +# join worked? +if (nrow(table) != nrow(reference)){ + exit_with_error("map and reference files could not be joined") +} + +table <- data.frame(ID = 1:1000, map = round(runif(1000, 8, 12)), reference = round(runif(1000, 8, 12))) + +# get all classes +classes <- c( + table$map, + table$reference + ) %>% + unique() %>% + sort() +n_classes <- length(classes) + + +# initialize confusion matrix +confusion_counts <- matrix( + NA, + n_classes, + n_classes, + dimnames = list( + map = classes, + reference = classes + ) +) + +# populate confusion matrix +for (m in 1:n_classes){ + for (r in 1:n_classes){ + + confusion_counts[m, r] <- + table %>% + filter( + map == classes[m] & + reference == classes[r] + ) %>% + nrow() + + } +} + + +acc_metrics <- function(confusion_matrix) { + + sum_all <- sum(confusion_matrix) + sum_map_class <- rowSums(confusion_matrix) + sum_ref_class <- colSums(confusion_matrix) + + oa <- confusion_matrix %>% + diag() %>% + sum() %>% + `/`(sum_all) + oa + + pa <- confusion_matrix %>% + diag() %>% + `/`(sum_ref_class) + pa + + ua <- confusion_matrix %>% + diag() %>% + `/`(sum_map_class) + ua + + oe <- 1 - pa + ce <- 1 - ua + + + list( + oa = oa, + pa = pa, + ua = ua, + oe = oe, + ce = ce + ) %>% + return() + +} + + + +cnt <- data.frame(class = 12:8, count = runif(5, 1e4, 1e6)) + +cnt <- cnt %>% + arrange(class) %>% + mutate(weight = count / sum(count)) + +if (!any(cnt$class == classes)) exit_with_error("classes in file-count do not match with map or reference classes") + +confusion_adjusted <- + confusion_counts / + matrix( + rowSums(confusion_counts), + n_classes, + n_classes, + byrow = FALSE + ) * + matrix( + cnt$weight, + n_classes, + n_classes, + byrow = FALSE + ) + +confusion_counts +confusion_adjusted + +acc_metrics(confusion_counts) +acc_metrics(confusion_adjusted) From dfa8793697da1bb4bc44f68c1b596a2495341f73 Mon Sep 17 00:00:00 2001 From: David Frantz Date: Sun, 28 Jan 2024 13:30:47 +0100 Subject: [PATCH 06/23] some refinement --- rstats/force-sample-size.r | 30 +++++++++++++++++++----------- src/aux-level/_hist.c | 2 +- 2 files changed, 20 insertions(+), 12 
deletions(-) diff --git a/rstats/force-sample-size.r b/rstats/force-sample-size.r index 3be99b91..f271f350 100644 --- a/rstats/force-sample-size.r +++ b/rstats/force-sample-size.r @@ -18,6 +18,7 @@ # You should have received a copy of the GNU General Public License # along with FORCE. If not, see . +info <- "Suggest sample size for estimating area and accuracy of a classification.\n" # load libraries #################################################### library(dplyr) @@ -40,18 +41,18 @@ usage <- function(exit){ " -v = show version\n", " -i = show program's purpose\n", "\n", - " -e error = standard error, defaults to 0.01\n", + " -e error = standard error of the estimated overall accuracy (default: 0.01)\n", "\n", - " -s min_size = minimum sample size per class\n", + " -s min_size = minimum sample size per class (default: 50)\n", "\n", " -o output-file = output file path with extension,\n", " defaults to './sample-size.csv'\n", "\n", " -c count-file = csv table with pixel counts per class\n", - " 2 columns named value and count", + " 2 columns named class and count", "\n", " -u user_acc-file = csv table with expected user accuracy per class\n", - " 2 columns named value and UA", + " 2 columns named class and UA", "\n", ) @@ -103,9 +104,9 @@ spec <- matrix( "info", "i", 0, "logical", "error", "e", 2, "numeric", "min_size", "s", 2, "integer", + "output", "o", 2, "character", "counts", "c", 1, "character", - "user_acc", "u", 1, "character", - "output", "o", 1, "character" + "user_acc", "u", 1, "character" ), byrow = TRUE, ncol = 4 @@ -114,7 +115,7 @@ spec <- matrix( opt <- getopt(spec) if (!is.null(opt$help)) usage() -if (!is.null(opt$info)) exit_normal("Suggest sample size for validation of a classification.\n") +if (!is.null(opt$info)) exit_normal(info) if (!is.null(opt$version)) exit_normal("Printing function not implemented yet. Sorry.\n") if (is.null(opt$counts)) exit_with_error("count-file is missing!") @@ -131,6 +132,12 @@ if (is.null(opt$output)) opt$output <- "sample-size.csv" pixel_counts <- read.csv(opt$counts) users_accuracy <- read.csv(opt$user_acc) +# columns OK? +if (!"class" %in% colnames(pixel_counts)) exit_with_error("class column in count-file missing") +if (!"class" %in% colnames(users_accuracy)) exit_with_error("class column in user_acc-file missing") +if (!"count" %in% colnames(pixel_counts)) exit_with_error("count column in count-file missing") +if (!"UA" %in% colnames(user_acc)) exit_with_error("UA column in user_acc-file missing") + # main thing ######################################################## @@ -138,7 +145,7 @@ users_accuracy <- read.csv(opt$user_acc) table <- users_accuracy %>% inner_join( pixel_counts, - by = "value" + by = "class" ) # join worked? @@ -155,7 +162,7 @@ table <- table %>% # number of recommended samples number <- (sum(table$areaXstdev)/standard_error)**2 -sprintf("%d samples are suggested.\n", number) %>% +sprintf("Suggested sample size: %d\n", number) %>% cat() # compute class-wise sample size for equal and proportional allocation @@ -183,13 +190,14 @@ if (min(table$proportional) < minsize){ filter(proportional >= minsize) %>% mutate(compromise = (number-n_rare) * area) + table <- rbind(rare, big) + # check if proportionally allocated classes are big enough if (any(big$compromise < minsize)){ + print(table) exit_with_error("Compromising failed. 
Adjust input parameters.") } - table <- rbind(rare, big) - } else { cat("Proportional allocation recommended.\n") } diff --git a/src/aux-level/_hist.c b/src/aux-level/_hist.c index c4e4d992..61c9e00b 100644 --- a/src/aux-level/_hist.c +++ b/src/aux-level/_hist.c @@ -203,7 +203,7 @@ FILE *fout = NULL; fprintf(stderr, "Unable to open output file %s\n", args.file_output); return FAILURE;} - fprintf(fout, "value,count\n"); + fprintf(fout, "class,count\n"); for (i=0; i Date: Sun, 28 Jan 2024 16:33:35 +0100 Subject: [PATCH 07/23] cont. implementation --- rstats/force-map-accuracy.r | 136 ++++++++++++++++++++++++++++-------- 1 file changed, 106 insertions(+), 30 deletions(-) diff --git a/rstats/force-map-accuracy.r b/rstats/force-map-accuracy.r index 6ee1dd16..20bb8342 100644 --- a/rstats/force-map-accuracy.r +++ b/rstats/force-map-accuracy.r @@ -33,9 +33,10 @@ usage <- function(exit){ message <- c( sprintf( - "Usage: %s [-h] [-v] [-i] [-o output-file] -c count-file -m map-file -r reference-file\n", + "Usage: %s [-h] [-v] [-i] [-o output-file] -c count-file \n", get_Rscript_filename() ), + " -m map-file -r reference-file -a pixel-area\n", "\n", " -h = show this help\n", " -v = show version\n", @@ -53,6 +54,10 @@ usage <- function(exit){ " -r reference-file = csv table with reference class labels\n", " 2 columns named ID and reference", "\n", + " -a pixel-area = area of one pixel in desired reporting unit, e.g.\n", + " 100 for a Sentinel-2 based map to be reported in m², or\n", + " 0.01 for a Sentinel-2 based map to be reported in ha\n", + "\n", ) @@ -98,13 +103,14 @@ file_existing <- function(path) { spec <- matrix( c( - "help", "h", 0, "logical", - "version", "v", 0, "logical", - "info", "i", 0, "logical", - "output", "o", 2, "character", - "counts", "c", 1, "character", - "map", "m", 1, "character", - "reference", "r", 1, "character" + "help", "h", 0, "logical", + "version", "v", 0, "logical", + "info", "i", 0, "logical", + "output", "o", 2, "character", + "counts", "c", 1, "character", + "map", "m", 1, "character", + "reference", "r", 1, "character", + "pixel_area", "a", 1, "numeric" ), byrow = TRUE, ncol = 4 @@ -119,6 +125,7 @@ if (!is.null(opt$version)) exit_normal("Printing function not implemented yet. 
S if (is.null(opt$counts)) exit_with_error("count-file is missing!") if (is.null(opt$map)) exit_with_error("map-file is missing!") if (is.null(opt$reference)) exit_with_error("reference-file is missing!") +if (is.null(opt$area)) exit_with_error("pixel-area is missing!") file_existing(opt$counts) file_existing(opt$map) @@ -143,7 +150,8 @@ if (!"count" %in% colnames(cnt)) exit_with_error("count column in count-file mis # main thing ######################################################## # join input tables -table <- reference %>% +table <- + reference %>% inner_join( map, by = "ID" @@ -157,7 +165,8 @@ if (nrow(table) != nrow(reference)){ table <- data.frame(ID = 1:1000, map = round(runif(1000, 8, 12)), reference = round(runif(1000, 8, 12))) # get all classes -classes <- c( +classes <- + c( table$map, table$reference ) %>% @@ -167,15 +176,16 @@ n_classes <- length(classes) # initialize confusion matrix -confusion_counts <- matrix( - NA, - n_classes, - n_classes, - dimnames = list( - map = classes, - reference = classes +confusion_counts <- + matrix( + NA, + n_classes, + n_classes, + dimnames = list( + map = classes, + reference = classes + ) ) -) # populate confusion matrix for (m in 1:n_classes){ @@ -195,26 +205,30 @@ for (m in 1:n_classes){ acc_metrics <- function(confusion_matrix) { - sum_all <- sum(confusion_matrix) + sum_all <- sum(confusion_matrix) sum_map_class <- rowSums(confusion_matrix) sum_ref_class <- colSums(confusion_matrix) - oa <- confusion_matrix %>% + # overall accuracy + oa <- + confusion_matrix %>% diag() %>% sum() %>% `/`(sum_all) - oa - pa <- confusion_matrix %>% + # producer's accuracy + pa <- + confusion_matrix %>% diag() %>% `/`(sum_ref_class) - pa - ua <- confusion_matrix %>% + # user's accuracy + ua <- + confusion_matrix %>% diag() %>% `/`(sum_map_class) - ua + # error of omission and commission oe <- 1 - pa ce <- 1 - ua @@ -234,12 +248,18 @@ acc_metrics <- function(confusion_matrix) { cnt <- data.frame(class = 12:8, count = runif(5, 1e4, 1e6)) -cnt <- cnt %>% +# compute propoertional area per class, area in reporting unit, and sort the classes +cnt <- + cnt %>% arrange(class) %>% - mutate(weight = count / sum(count)) + mutate(weight = count / sum(count)) %>% + mutate(area = count * opt$pixel_area) -if (!any(cnt$class == classes)) exit_with_error("classes in file-count do not match with map or reference classes") +# classes should now align with map/reference dataset +if (!any(cnt$class == classes)) + exit_with_error("classes in file-count do not match with map or reference classes") +# Olofsson et al. 2013, eq. 1 confusion_adjusted <- confusion_counts / matrix( @@ -258,5 +278,61 @@ confusion_adjusted <- confusion_counts confusion_adjusted -acc_metrics(confusion_counts) -acc_metrics(confusion_adjusted) + +# Olofsson et al. 2013, eq. 2 +area_adjusted <- + colSums(confusion_adjusted) * sum(cnt$area) + + + +proportions <- + confusion_counts / + matrix( + rowSums(confusion_counts), + n_classes, + n_classes, + byrow = FALSE + ) + + +# Olofsson et al. 2013, eq. 3-5 +confidence_area_adjusted <- +{ +proportions * +(1 - proportions) / +matrix( + rowSums(confusion_counts) - 1, + n_classes, + n_classes, + byrow = FALSE +) * +matrix( + cnt$weight**2, + n_classes, + n_classes, + byrow = FALSE +) + } %>% + rowSums() %>% + sqrt() %>% + `*`(sum(cnt$area)) %>% + `*`(1.96) +confidence_area_adjusted + + +# Olofsson et al. 2013, eq. 6-8 +acc_traditional <- acc_metrics(confusion_counts) +acc_adjusted <- acc_metrics(confusion_adjusted) + +# Olofsson et al. 2014, eq. 
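The two estimators implemented above can be stated compactly. With rows of the confusion matrix being map classes i, columns reference classes j, n_ij the sample counts, n_i. the row totals, W_i the mapped area proportions (weight) and A_tot the total map area (sum(cnt$area)), the code computes, in LaTeX notation:

    \hat{p}_{ij} = W_i \, \frac{n_{ij}}{n_{i\cdot}}              % Olofsson et al. 2013, eq. 1
    \hat{A}_j    = A_{tot} \, \sum_i \hat{p}_{ij}                % Olofsson et al. 2013, eq. 2

The factor 1.96 that multiplies the standard errors throughout corresponds to an approximate 95 % confidence interval.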
5 +oa_se <- sum(cnt$weight**2 * acc_adjusted$ua * (1 - acc_adjusted$ua) / (rowSums(confusion_counts) - 1)) %>% +sqrt() %>% +`*`(1.96) + +# Olofsson et al. 2014, eq. 6 +ua_se <- (acc_adjusted$ua * (1 - acc_adjusted$ua) / (rowSums(confusion_counts) - 1)) %>% +sqrt() %>% +`*`(1.96) + +# Olofsson et al. 2014, eq. 7 +pa_se <- # todo, looks comlicated From a854843b5f6bdeb25003f43c0735141d1f715e30 Mon Sep 17 00:00:00 2001 From: David Frantz Date: Tue, 30 Jan 2024 09:19:30 +0100 Subject: [PATCH 08/23] area adjusted accuracies implemented --- rstats/force-map-accuracy.r | 104 ++++++++++++++++++++++++++++++++++-- 1 file changed, 99 insertions(+), 5 deletions(-) diff --git a/rstats/force-map-accuracy.r b/rstats/force-map-accuracy.r index 20bb8342..21e88637 100644 --- a/rstats/force-map-accuracy.r +++ b/rstats/force-map-accuracy.r @@ -60,12 +60,12 @@ usage <- function(exit){ "\n", ) - + cat( message, file = if (exit == 0) stdout else stderr() ) - + quit( save = "no", status = exit @@ -202,6 +202,22 @@ for (m in 1:n_classes){ } } +classes <- 1:4 +n_classes <- 4 +confusion_counts <- matrix( + c( + 150, 12, 1, 2, + 32, 100, 21, 3, + 0, 32, 120, 0, + 0, 0, 5, 130 + ), byrow = TRUE, + n_classes, + n_classes, + dimnames = list( + map = classes, + reference = classes + ) +) acc_metrics <- function(confusion_matrix) { @@ -246,7 +262,8 @@ acc_metrics <- function(confusion_matrix) { -cnt <- data.frame(class = 12:8, count = runif(5, 1e4, 1e6)) +cnt <- data.frame(class = 1:4, count = c(20000,200000,300000,350000)) +opt <- list(pixel_area = 30^2/10000) # ha # compute propoertional area per class, area in reporting unit, and sort the classes cnt <- @@ -313,7 +330,7 @@ matrix( byrow = FALSE ) } %>% - rowSums() %>% + colSums() %>% sqrt() %>% `*`(sum(cnt$area)) %>% `*`(1.96) @@ -335,4 +352,81 @@ sqrt() %>% `*`(1.96) # Olofsson et al. 2014, eq. 
7 -pa_se <- # todo, looks comlicated + + +Nj <- + { + matrix( + cnt$count, + n_classes, + n_classes, + byrow = FALSE + ) / + matrix( + rowSums(confusion_counts), + n_classes, + n_classes, + byrow = FALSE + ) * + confusion_counts + } %>% + colSums() + + +term1 <- cnt$count**2 * +(1 - acc_adjusted$pa)**2 * +acc_adjusted$ua * +(1 - acc_adjusted$ua) / +(colSums(confusion_counts) - 1) + +leave_class_out <- +matrix( + 1, n_classes, + n_classes +) +diag(leave_class_out) <- 0 + +term2 <- { +matrix( + cnt$count**2, + n_classes, + n_classes, + byrow = FALSE +) * +confusion_counts / +matrix( + rowSums(confusion_counts), + n_classes, + n_classes, + byrow = FALSE +) * +( + 1 - + confusion_counts / + matrix( + rowSums(confusion_counts), + n_classes, + n_classes, + byrow = FALSE + ) +) / +matrix( +(rowSums(confusion_counts) - 1), + n_classes, + n_classes, + byrow = FALSE +) * +leave_class_out +} %>% +colSums() %>% +`*`(acc_adjusted$pa**2) + + + +pa_se <- +{ + (1/Nj**2) * + (term1 + term2) +} %>% +sqrt() %>% +`*`(1.96) From 19feb7f8996cc10e5b8203dd0ff674bb5d86f1e6 Mon Sep 17 00:00:00 2001 From: David Frantz Date: Wed, 31 Jan 2024 08:17:29 +0100 Subject: [PATCH 09/23] further development, u+x in rstats/* --- rstats/force-level2-report.Rmd | 0 rstats/force-map-accuracy.r | 9 +- rstats/force-sample-size.r | 61 ++++---- src/aux-level/_stratified_sample.c | 230 +++++++++++++++++++++++++++++ src/cross-level/read-cl.c | 2 +- 5 files changed, 272 insertions(+), 30 deletions(-) mode change 100644 => 100755 rstats/force-level2-report.Rmd mode change 100644 => 100755 rstats/force-map-accuracy.r mode change 100644 => 100755 rstats/force-sample-size.r create mode 100644 src/aux-level/_stratified_sample.c diff --git a/rstats/force-level2-report.Rmd b/rstats/force-level2-report.Rmd old mode 100644 new mode 100755 diff --git a/rstats/force-map-accuracy.r b/rstats/force-map-accuracy.r old mode 100644 new mode 100755 index 21e88637..06b5e4f3 --- a/rstats/force-map-accuracy.r +++ b/rstats/force-map-accuracy.r @@ -263,7 +263,7 @@ acc_metrics <- function(confusion_matrix) { cnt <- data.frame(class = 1:4, count = c(20000,200000,300000,350000)) -opt <- list(pixel_area = 30^2/10000) # ha +opt <- list(pixel_area = 30^2/10000, output = "accuracy-assessment.txt") # ha # compute propoertional area per class, area in reporting unit, and sort the classes cnt <- @@ -430,3 +430,10 @@ pa_se <- } %>% sqrt() %>% `*`(1.96) + +fo <- file(opt$output, "w") + +cat("# Accuracy assessment", file = fo) + +close(fo) + diff --git a/rstats/force-sample-size.r b/rstats/force-sample-size.r old mode 100644 new mode 100755 index f271f350..3ede6d5f --- a/rstats/force-sample-size.r +++ b/rstats/force-sample-size.r @@ -25,8 +25,13 @@ library(dplyr) library(getopt) -# input ############################################################# +# program name ###################################################### +progname <<- + get_Rscript_filename() %>% + basename() + +# input ############################################################# # usage function in same style as other FORCE tools, # do not use getop's built-in usage function for consistency usage <- function(exit){ @@ -34,7 +39,7 @@ usage <- function(exit){ message <- c( sprintf( "Usage: %s [-h] [-v] [-i] [-e error] [-s min_size] [-o output-file] -c count-file -u user_acc-file\n", - get_Rscript_filename() + progname ), "\n", " -h = show this help\n", @@ -53,15 +58,14 @@ usage <- function(exit){ "\n", " -u user_acc-file = csv table with expected user accuracy per class\n", " 2 columns named class and UA", - 
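The producer's-accuracy interval implemented in patch 08 above (the Nj / term1 / term2 block, the step marked as todo in patch 07) boils down to the following structure, stated here only in the code's own terms: N_i are the mapped pixel counts (cnt$count), n_ij the confusion-matrix counts, and Nj holds \hat{N}_j = \sum_i (N_i / n_{i\cdot}) \, n_{ij}, the estimated pixel total of reference class j. term1 carries the contribution of map class j itself, term2 sums the contributions of all other map classes (the leave_class_out matrix zeroes the diagonal), and the reported value is the half-width of an approximate 95 % confidence interval:

    \widehat{SE}_{95\%}(\hat{P}_j) = \frac{1.96}{\hat{N}_j} \, \sqrt{\mathrm{term1}_j + \mathrm{term2}_j}

As the code comment indicates, this corresponds to Olofsson et al. (2014), eq. 7.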
"\n", - + "\n" ) - + cat( message, - file = if (exit == 0) stdout else stderr() + file = if (exit == 0) stdout() else stderr() ) - + quit( save = "no", status = exit @@ -81,14 +85,14 @@ exit_normal <- function(argument) { exit_with_error <- function(argument) { cat( - sprintf("%s\n", argument), + sprintf("%s\n", argument), file = stderr() ) usage(1) } file_existing <- function(path) { - if (!file.exist(path)){ + if (!file.exists(path)) { cat( sprintf("file %s does not exist\n", path), file = stderr() @@ -107,14 +111,14 @@ spec <- matrix( "output", "o", 2, "character", "counts", "c", 1, "character", "user_acc", "u", 1, "character" - ), - byrow = TRUE, + ), + byrow = TRUE, ncol = 4 ) opt <- getopt(spec) -if (!is.null(opt$help)) usage() +if (!is.null(opt$help)) usage(0) if (!is.null(opt$info)) exit_normal(info) if (!is.null(opt$version)) exit_normal("Printing function not implemented yet. Sorry.\n") @@ -136,7 +140,7 @@ users_accuracy <- read.csv(opt$user_acc) if (!"class" %in% colnames(pixel_counts)) exit_with_error("class column in count-file missing") if (!"class" %in% colnames(users_accuracy)) exit_with_error("class column in user_acc-file missing") if (!"count" %in% colnames(pixel_counts)) exit_with_error("count column in count-file missing") -if (!"UA" %in% colnames(user_acc)) exit_with_error("UA column in user_acc-file missing") +if (!"UA" %in% colnames(users_accuracy)) exit_with_error("UA column in user_acc-file missing") # main thing ######################################################## @@ -154,46 +158,47 @@ if (nrow(table) != nrow(pixel_counts)){ } # compute proportional area, and standard deviation of UA -table <- table %>% +table <- table %>% mutate(area = count / sum(count)) %>% - mutate(stdev = sqrt(UA*(1-UA))) %>% + mutate(stdev = sqrt(UA * (1 - UA))) %>% mutate(areaXstdev = area * stdev) # number of recommended samples -number <- (sum(table$areaXstdev)/standard_error)**2 +number <- (sum(table$areaXstdev) / opt$error)**2 %>% + as.integer() sprintf("Suggested sample size: %d\n", number) %>% cat() # compute class-wise sample size for equal and proportional allocation -table <- table %>% +table <- table %>% mutate(equal = round(number / nrow(table))) %>% mutate(proportional = round(number * area)) %>% mutate(compromise = NA) # do we have enough samples in proportional allocation? -if (min(table$proportional) < minsize){ +if (min(table$proportional) < opt$min_size) { cat("Proportional allocation yields too few samples.\n") cat("A compromise between equal and proportional allocation is recommended.\n") # first, assign minimum sample size to small classes rare <- table %>% - filter(proportional < minsize) %>% - mutate(compromise = minsize) - + filter(proportional < opt$min_size) %>% + mutate(compromise = opt$min_size) + n_rare <- sum(rare$compromise) - + # allocate remaining samples to big classes proportionally big <- table %>% - filter(proportional >= minsize) %>% - mutate(compromise = (number-n_rare) * area) + filter(proportional >= opt$min_size) %>% + mutate(compromise = (number - n_rare) * area) table <- rbind(rare, big) # check if proportionally allocated classes are big enough - if (any(big$compromise < minsize)){ + if (any(big$compromise < opt$min_size)) { print(table) exit_with_error("Compromising failed. 
Adjust input parameters.") } @@ -208,8 +213,8 @@ table <- table %>% # write output write.csv( - table, - opt$output, - row.names = FALSE, + table, + opt$output, + row.names = FALSE, quote = FALSE ) diff --git a/src/aux-level/_stratified_sample.c b/src/aux-level/_stratified_sample.c new file mode 100644 index 00000000..777733ca --- /dev/null +++ b/src/aux-level/_stratified_sample.c @@ -0,0 +1,230 @@ +/**+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +This file is part of FORCE - Framework for Operational Radiometric +Correction for Environmental monitoring. + +Copyright (C) 2013-2024 David Frantz + +FORCE is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +FORCE is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with FORCE. If not, see . + ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++**/ + +/**+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ +This program computes a histogram of the given image ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++**/ + + +#include // core input and output functions +#include // standard general utilities library + +#include // testing and mapping characters +#include // standard symbolic constants and types + +#include "../cross-level/const-cl.h" +#include "../cross-level/konami-cl.h" +#include "../cross-level/string-cl.h" + +/** Geospatial Data Abstraction Library (GDAL) **/ +#include "gdal.h" // public (C callable) GDAL entry points + + +typedef struct { + int n; + int band; + char file_input[NPOW_10]; + char file_output[NPOW_10]; +} args_t; + + +void usage(char *exe, int exit_code){ + + + printf("Usage: %s [-h] [-v] [-i] [-b band] [-o output-file] input-image\n", exe); + printf("\n"); + printf(" -h = show this help\n"); + printf(" -v = show version\n"); + printf(" -i = show program's purpose\n"); + printf("\n"); + printf(" -b band = band to use,\n"); + printf(" defaults to 1\n"); + printf("\n"); + printf(" -o output-file = output file path with extension,\n"); + printf(" defaults to './histogram.csv'\n"); + printf("\n"); + printf(" Positional arguments:\n"); + printf(" - 'input-image': image for computing the histogram\n"); + printf("\n"); + + exit(exit_code); + return; +} + + +void parse_args(int argc, char *argv[], args_t *args){ +int opt; + + + opterr = 0; + + // default parameters + copy_string(args->file_output, NPOW_10, "histogram.csv"); + args->band = 1; + + // optional parameters + while ((opt = getopt(argc, argv, "hvio:b:")) != -1){ + switch(opt){ + case 'h': + usage(argv[0], SUCCESS); + case 'v': + printf("FORCE version: %s\n", _VERSION_); + exit(SUCCESS); + case 'i': + printf("Compute image histogram\n"); + exit(SUCCESS); + case 'o': + copy_string(args->file_output, NPOW_10, optarg); + break; + case 'b': + args->band = atoi(optarg); + if (args->band < 0){ + fprintf(stderr, "Band must be >= 1\n"); + usage(argv[0], FAILURE); + } + break; + case '?': + if (isprint(optopt)){ + fprintf(stderr, "Unknown option `-%c'.\n", optopt); + } else { + fprintf(stderr, "Unknown option character `\\x%x'.\n", optopt); + } + usage(argv[0], 
FAILURE); + default: + fprintf(stderr, "Error parsing arguments.\n"); + usage(argv[0], FAILURE); + } + } + + // non-optional parameters + args->n = 1; + + if (optind < argc){ + konami_args(argv[optind]); + if (argc-optind == args->n){ + copy_string(args->file_input, NPOW_10, argv[optind++]); + } else if (argc-optind < args->n){ + fprintf(stderr, "some non-optional arguments are missing.\n"); + usage(argv[0], FAILURE); + } else if (argc-optind > args->n){ + fprintf(stderr, "too many non-optional arguments.\n"); + usage(argv[0], FAILURE); + } + } else { + fprintf(stderr, "non-optional arguments are missing.\n"); + usage(argv[0], FAILURE); + } + + return; +} + + +int main(int argc, char *argv[]){ +args_t args; +GDALDatasetH fp; +GDALRasterBandH band; +int i, j, nx, ny; +short *line = NULL; +short nodata; +int has_nodata; + +int offset = SHRT_MAX+1; +int length = USHRT_MAX+1; +off_t counts[length]; +FILE *fout = NULL; + + +double **sample_size = NULL +int n_class, n_column; + + parse_args(argc, argv, &args); + + sample_size = read_table(args.file_size, &n_class, &n_column); + if (n_column != 3){ + fprintf(stderr, "input-size does not have 3 columns %s.\n", args.file_input); exit(1);} + + GDALAllRegister(); + if ((fp = GDALOpen(args.file_input, GA_ReadOnly)) == NULL){ + fprintf(stderr, "could not open %s.\n", args.file_input); exit(1);} + + nx = GDALGetRasterXSize(fp); + ny = GDALGetRasterYSize(fp); + + alloc((void**)&line, nx, sizeof(short)); + + band = GDALGetRasterBand(fp, args.band); + + nodata = (short)GDALGetRasterNoDataValue(band, &has_nodata); + if (!has_nodata){ + fprintf(stderr, "input image has no nodata value.\n"); + exit(1); + } + + + + memset(counts, 0, sizeof(off_t)*length); + + for (i=0; i Date: Wed, 31 Jan 2024 09:11:42 +0100 Subject: [PATCH 10/23] collected vars at end --- rstats/force-map-accuracy.r | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/rstats/force-map-accuracy.r b/rstats/force-map-accuracy.r index 06b5e4f3..f0e54a95 100755 --- a/rstats/force-map-accuracy.r +++ b/rstats/force-map-accuracy.r @@ -292,8 +292,6 @@ confusion_adjusted <- byrow = FALSE ) -confusion_counts -confusion_adjusted # Olofsson et al. 2013, eq. 
2 @@ -431,6 +429,23 @@ pa_se <- sqrt() %>% `*`(1.96) + + +confusion_counts +confusion_adjusted + +acc_traditional +acc_adjusted + +cnt$area +area_adjusted +confidence_area_adjusted + +oa_se +pa_se +ua_se + + fo <- file(opt$output, "w") cat("# Accuracy assessment", file = fo) From c203ffde9f5df50fa6299cbad6e93b6eddbc1453 Mon Sep 17 00:00:00 2001 From: David Frantz Date: Wed, 31 Jan 2024 13:00:58 +0100 Subject: [PATCH 11/23] skip lines when writing table --- src/cross-level/table-cl.c | 4 +++- src/cross-level/table-cl.h | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/cross-level/table-cl.c b/src/cross-level/table-cl.c index 74d49265..cad59b02 100755 --- a/src/cross-level/table-cl.c +++ b/src/cross-level/table-cl.c @@ -347,7 +347,7 @@ int width, *max_width = NULL; --- separator: column separator +++ Return: void +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++**/ -void write_table(table_t *table, char *fname, char *separator){ +void write_table(table_t *table, char *fname, char *separator, bool skip_rows){ int row, col; FILE *fp = NULL; @@ -369,6 +369,8 @@ FILE *fp = NULL; for (row=0; rownrow; row++){ + if (skip_rows && !table->row_mask[row]) continue; + if (table->has_row_names) printf("%s%s", table->row_names[row], separator); for (col=0; col<(table->ncol-1); col++) printf("%.2f%s", table->data[row][col], separator); printf("%.2f\n", table->data[row][col]); diff --git a/src/cross-level/table-cl.h b/src/cross-level/table-cl.h index f7340278..a349acfa 100755 --- a/src/cross-level/table-cl.h +++ b/src/cross-level/table-cl.h @@ -63,7 +63,7 @@ table_t read_table(char *fname, bool has_row_names, bool has_col_names); table_t allocate_table(int nrow, int ncol, bool has_row_names, bool has_col_names); void init_table(table_t *table); void print_table(table_t *table, bool truncate); -void write_table(table_t *table, char *fname, char *separator); +void write_table(table_t *table, char *fname, char *separator, bool skip_rows); void free_table(table_t *table); #ifdef __cplusplus From cd7830d81e23976a4f32848fec0a79f0c6a12830 Mon Sep 17 00:00:00 2001 From: David Frantz Date: Wed, 31 Jan 2024 13:18:42 +0100 Subject: [PATCH 12/23] used gerneric table in force-hist --- src/aux-level/_hist.c | 35 ++++++++++++++--------------------- 1 file changed, 14 insertions(+), 21 deletions(-) diff --git a/src/aux-level/_hist.c b/src/aux-level/_hist.c index 61c9e00b..08e64631 100644 --- a/src/aux-level/_hist.c +++ b/src/aux-level/_hist.c @@ -34,6 +34,7 @@ This program computes a histogram of the given image #include "../cross-level/const-cl.h" #include "../cross-level/konami-cl.h" #include "../cross-level/string-cl.h" +#include "../cross-level/table-cl.h" /** Geospatial Data Abstraction Library (GDAL) **/ #include "gdal.h" // public (C callable) GDAL entry points @@ -146,11 +147,10 @@ int i, j, nx, ny; short *line = NULL; short nodata; int has_nodata; - int offset = SHRT_MAX+1; int length = USHRT_MAX+1; -off_t counts[length]; -FILE *fout = NULL; +table_t counts; +int row; parse_args(argc, argv, &args); @@ -173,8 +173,10 @@ FILE *fout = NULL; } - - memset(counts, 0, sizeof(off_t)*length); + counts = allocate_table(length, 2, false, true); + copy_string(counts.col_names[0], NPOW_10, "class"); + copy_string(counts.col_names[1], NPOW_10, "count"); + for (row=0; row Date: Wed, 31 Jan 2024 15:34:21 +0100 Subject: [PATCH 13/23] added new program force-stratified-sample --- Makefile | 7 ++- ...ratified_sample.c => _stratified-sample.c} | 52 ++++++------------- 2 files changed, 21 
insertions(+), 38 deletions(-) rename src/aux-level/{_stratified_sample.c => _stratified-sample.c} (82%) diff --git a/Makefile b/Makefile index 6c1ad4d9..83b2fcb8 100755 --- a/Makefile +++ b/Makefile @@ -78,7 +78,7 @@ FORCE_EXE = force force-cube force-higher-level force-import-modis \ force-procmask force-pyramid force-qai-inflate force-stack \ force-synthmix force-tabulate-grid force-tile-extent \ force-tile-finder force-train force-level2-report force-cube-init \ - force-init force-datacube-size force-hist + force-init force-datacube-size force-hist force-stratified-sample FORCE_MISC = force-level2-report.Rmd @@ -118,7 +118,7 @@ cross: string_cl enum_cl cite_cl utils_cl alloc_cl brick_cl imagefuns_cl param_c lower: table_ll param_ll meta_ll cube_ll equi7_ll glance7_ll atc_ll sunview_ll read_ll radtran_ll topo_ll cloud_ll gas_ll brdf_ll atmo_ll aod_ll resmerge_ll coreg_ll coregfuns_ll acix_ll modwvp_ll higher: param_hl progress_hl tasks_hl read-aux_hl read-ard_hl quality_hl bap_hl level3_hl cso_hl tsa_hl index_hl interpolate_hl stm_hl fold_hl standardize_hl pheno_hl polar_hl trend_hl ml_hl texture_hl lsm_hl lib_hl sample_hl imp_hl cfimp_hl l2imp_hl spec-adjust_hl pyp_hl rsp_hl udf_hl aux: param_aux param_train_aux train_aux -exe: force force-parameter force-qai-inflate force-tile-finder force-tabulate-grid force-l2ps force-higher-level force-train force-lut-modis force-mdcp force-stack force-import-modis force-cube-init force-hist +exe: force force-parameter force-qai-inflate force-tile-finder force-tabulate-grid force-l2ps force-higher-level force-train force-lut-modis force-mdcp force-stack force-import-modis force-cube-init force-hist force-stratified-sample .PHONY: temp all install install_ bash python external clean build check ### TEMP @@ -427,6 +427,9 @@ force-cube-init: temp cross lower $(DA)/_init-cube.c force-hist: temp cross $(DA)/_hist.c $(G11) $(CFLAGS) $(GDAL) $(GSL) $(CURL) -o $(TB)/force-hist $(DA)/_hist.c $(TC)/*.o $(TL)/*.o $(LDGDAL) $(LDGSL) $(LDCURL) +force-stratified-sample: temp cross $(DA)/_stratified-sample.c + $(G11) $(CFLAGS) $(GDAL) $(GSL) $(CURL) -o $(TB)/force-stratified-sample $(DA)/_stratified-sample.c $(TC)/*.o $(TL)/*.o $(LDGDAL) $(LDGSL) $(LDCURL) + ### dummy code for testing stuff dummy: temp cross aux higher src/dummy.c diff --git a/src/aux-level/_stratified_sample.c b/src/aux-level/_stratified-sample.c similarity index 82% rename from src/aux-level/_stratified_sample.c rename to src/aux-level/_stratified-sample.c index 777733ca..40c847ba 100644 --- a/src/aux-level/_stratified_sample.c +++ b/src/aux-level/_stratified-sample.c @@ -34,6 +34,7 @@ This program computes a histogram of the given image #include "../cross-level/const-cl.h" #include "../cross-level/konami-cl.h" #include "../cross-level/string-cl.h" +#include "../cross-level/table-cl.h" /** Geospatial Data Abstraction Library (GDAL) **/ #include "gdal.h" // public (C callable) GDAL entry points @@ -42,8 +43,10 @@ This program computes a histogram of the given image typedef struct { int n; int band; - char file_input[NPOW_10]; + char file_input_image[NPOW_10]; + char file_input_sample_size[NPOW_10]; char file_output[NPOW_10]; + char column_sample_size[NPOW_10]; } args_t; @@ -116,12 +119,14 @@ int opt; } // non-optional parameters - args->n = 1; + args->n = 3; if (optind < argc){ konami_args(argv[optind]); if (argc-optind == args->n){ - copy_string(args->file_input, NPOW_10, argv[optind++]); + copy_string(args->file_input_image, NPOW_10, argv[optind++]); + copy_string(args->file_input_sample_size, 
NPOW_10, argv[optind++]); + copy_string(args->column_sample_size, NPOW_10, argv[optind++]); } else if (argc-optind < args->n){ fprintf(stderr, "some non-optional arguments are missing.\n"); usage(argv[0], FAILURE); @@ -147,24 +152,20 @@ short *line = NULL; short nodata; int has_nodata; -int offset = SHRT_MAX+1; -int length = USHRT_MAX+1; -off_t counts[length]; -FILE *fout = NULL; +table_t sample_size; -double **sample_size = NULL -int n_class, n_column; parse_args(argc, argv, &args); - sample_size = read_table(args.file_size, &n_class, &n_column); - if (n_column != 3){ - fprintf(stderr, "input-size does not have 3 columns %s.\n", args.file_input); exit(1);} + sample_size = read_table(args.file_input_sample_size, false, true); + print_table(&sample_size, false); +// if (n_column != 3){ +// fprintf(stderr, "input-size does not have 3 columns %s.\n", args.file_input); exit(1);} GDALAllRegister(); - if ((fp = GDALOpen(args.file_input, GA_ReadOnly)) == NULL){ - fprintf(stderr, "could not open %s.\n", args.file_input); exit(1);} + if ((fp = GDALOpen(args.file_input_image, GA_ReadOnly)) == NULL){ + fprintf(stderr, "could not open %s.\n", args.file_input_image); exit(1);} nx = GDALGetRasterXSize(fp); ny = GDALGetRasterYSize(fp); @@ -180,9 +181,6 @@ int n_class, n_column; } - - memset(counts, 0, sizeof(off_t)*length); - for (i=0; i Date: Wed, 31 Jan 2024 16:30:54 +0100 Subject: [PATCH 14/23] find correct columns --- src/aux-level/_stratified-sample.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/src/aux-level/_stratified-sample.c b/src/aux-level/_stratified-sample.c index 40c847ba..285375a4 100644 --- a/src/aux-level/_stratified-sample.c +++ b/src/aux-level/_stratified-sample.c @@ -151,15 +151,26 @@ int i, j, nx, ny; short *line = NULL; short nodata; int has_nodata; - - table_t sample_size; +int col_size, col_class; parse_args(argc, argv, &args); sample_size = read_table(args.file_input_sample_size, false, true); + if ((col_size = find_table_col(&sample_size, args.column_sample_size)) < 0){ + printf("could not find column name in file-sample-size\n"); exit(FAILURE);} + if ((col_class = find_table_col(&sample_size, "class")) < 0){ + printf("could not find column name in file-sample-size\n"); exit(FAILURE);} + print_table(&sample_size, false); + printf("column %s in column %d\n", "class", col_class); + printf("column %s in column %d\n", args.column_sample_size, col_size); + + print("min/max class: %d/%d\n", table.min[col_class], table.max[col_class]); + + + // if (n_column != 3){ // fprintf(stderr, "input-size does not have 3 columns %s.\n", args.file_input); exit(1);} From ecf8d82f164d3e64feb4bd906637e97d94190880 Mon Sep 17 00:00:00 2001 From: David Frantz Date: Wed, 31 Jan 2024 16:51:03 +0100 Subject: [PATCH 15/23] dictionary for faster indexing --- src/aux-level/_stratified-sample.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/src/aux-level/_stratified-sample.c b/src/aux-level/_stratified-sample.c index 285375a4..698e62ef 100644 --- a/src/aux-level/_stratified-sample.c +++ b/src/aux-level/_stratified-sample.c @@ -154,6 +154,9 @@ int has_nodata; table_t sample_size; int col_size, col_class; +int offset = SHRT_MAX+1; +int length = USHRT_MAX+1; +enum { _CLASS_, _SIZE_, _PROBABILITY_, _DICT_LENGTH_ }; parse_args(argc, argv, &args); @@ -167,12 +170,22 @@ int col_size, col_class; printf("column %s in column %d\n", "class", col_class); printf("column %s in column %d\n", args.column_sample_size, col_size); - print("min/max 
class: %d/%d\n", table.min[col_class], table.max[col_class]); + printf("min/max class: %d/%d\n", (int)sample_size.min[col_class], (int)sample_size.max[col_class]); + + + + alloc_2D((void**)&dictionary, length, _DICT_LENGTH_, sizeof(double)); + + for (row=0; row Date: Wed, 31 Jan 2024 20:54:03 +0100 Subject: [PATCH 16/23] random number generator pretty close to target --- src/aux-level/_stratified-sample.c | 68 +++++++++++++++++++++++++----- 1 file changed, 57 insertions(+), 11 deletions(-) diff --git a/src/aux-level/_stratified-sample.c b/src/aux-level/_stratified-sample.c index 698e62ef..34f01b3f 100644 --- a/src/aux-level/_stratified-sample.c +++ b/src/aux-level/_stratified-sample.c @@ -31,6 +31,8 @@ This program computes a histogram of the given image #include // testing and mapping characters #include // standard symbolic constants and types +#include + #include "../cross-level/const-cl.h" #include "../cross-level/konami-cl.h" #include "../cross-level/string-cl.h" @@ -152,39 +154,52 @@ short *line = NULL; short nodata; int has_nodata; table_t sample_size; -int col_size, col_class; +int col_size, col_class, col_count, row; int offset = SHRT_MAX+1; int length = USHRT_MAX+1; -enum { _CLASS_, _SIZE_, _PROBABILITY_, _DICT_LENGTH_ }; +double **dictionary = NULL; +enum { _CLASS_, _SIZE_, _PROBABILITY_, _COUNT_, _DICT_LENGTH_ }; parse_args(argc, argv, &args); sample_size = read_table(args.file_input_sample_size, false, true); if ((col_size = find_table_col(&sample_size, args.column_sample_size)) < 0){ - printf("could not find column name in file-sample-size\n"); exit(FAILURE);} + printf("could not find column name %s in file-sample-size\n", args.column_sample_size); exit(FAILURE);} if ((col_class = find_table_col(&sample_size, "class")) < 0){ - printf("could not find column name in file-sample-size\n"); exit(FAILURE);} + printf("could not find column name %s in file-sample-size\n", "class"); exit(FAILURE);} + if ((col_count = find_table_col(&sample_size, "count")) < 0){ + printf("could not find column name %s in file-sample-size\n", "count"); exit(FAILURE);} print_table(&sample_size, false); printf("column %s in column %d\n", "class", col_class); + printf("column %s in column %d\n", "count", col_count); printf("column %s in column %d\n", args.column_sample_size, col_size); printf("min/max class: %d/%d\n", (int)sample_size.min[col_class], (int)sample_size.max[col_class]); - alloc_2D((void**)&dictionary, length, _DICT_LENGTH_, sizeof(double)); + alloc_2D((void***)&dictionary, length, _DICT_LENGTH_, sizeof(double)); for (row=0; row= probability){ + dictionary[line[j] + offset][_COUNT_]++; + printf("%d samples selected (row %d, col %d)\n", cnt++, i, j); + } + + } } + + for (row=0; row Date: Fri, 2 Feb 2024 15:34:43 +0100 Subject: [PATCH 17/23] option to skip rows when printing table --- src/cross-level/table-cl.c | 4 +++- src/cross-level/table-cl.h | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/cross-level/table-cl.c b/src/cross-level/table-cl.c index 0a821e34..6b7b387d 100755 --- a/src/cross-level/table-cl.c +++ b/src/cross-level/table-cl.c @@ -338,7 +338,7 @@ int row; --- table: table +++ Return: void +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++**/ -void print_table(table_t *table, bool truncate){ +void print_table(table_t *table, bool truncate, bool skip_rows){ int row, col; int nrow_print = 6, ncol_print = 6; int width, *max_width = NULL; @@ -383,6 +383,8 @@ int width, *max_width = NULL; for (row=0; rowrow_mask[row]) continue; + if 
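A note on the pixel selection added in patch 16 above: as far as the diff shows, each pixel of a class is tested independently against a per-class probability derived from the requested sample size and the class pixel count, so the realised per-class sample sizes only match the targets in expectation, which is what the commit message ("pretty close to target") describes. If exact per-class sizes are required, the classic sequential selection rule, accepting a pixel with probability (samples still needed) / (pixels still remaining), guarantees them (cf. Knuth's Algorithm S). A minimal R sketch with hypothetical numbers, independent of the C implementation:

    set.seed(1)
    N_c <- 1000                      # hypothetical pixel count of one class
    n_c <- 50                        # hypothetical target sample size
    needed    <- n_c
    remaining <- N_c
    selected  <- logical(N_c)
    for (i in seq_len(N_c)) {
      if (runif(1) < needed / remaining) {   # accept with probability needed/remaining
        selected[i] <- TRUE
        needed <- needed - 1
      }
      remaining <- remaining - 1
    }
    sum(selected)                    # exactly n_c = 50 in every run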
(table->has_row_names) printf("%*s ", max_width[0], table->row_names[row]); for (col=0; coldata[row][col]); if (truncate && col < table->ncol) printf("..."); diff --git a/src/cross-level/table-cl.h b/src/cross-level/table-cl.h index da7fe84d..60efc4bb 100755 --- a/src/cross-level/table-cl.h +++ b/src/cross-level/table-cl.h @@ -67,7 +67,7 @@ table_t allocate_table(int nrow, int ncol, bool has_row_names, bool has_col_name int find_table_col(table_t *table, const char *name); int find_table_row(table_t *table, const char *name); void init_table(table_t *table); -void print_table(table_t *table, bool truncate); +void print_table(table_t *table, bool truncate, bool skip_rows); void write_table(table_t *table, char *fname, const char *separator, bool skip_rows); void free_table(table_t *table); From 6e9b29ed8553df30438ed5b3a60f187253c9db7b Mon Sep 17 00:00:00 2001 From: David Frantz Date: Fri, 2 Feb 2024 15:35:20 +0100 Subject: [PATCH 18/23] fixed band request --- src/aux-level/_hist.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/aux-level/_hist.c b/src/aux-level/_hist.c index 08e64631..899c4368 100644 --- a/src/aux-level/_hist.c +++ b/src/aux-level/_hist.c @@ -98,7 +98,7 @@ int opt; break; case 'b': args->band = atoi(optarg); - if (args->band < 0){ + if (args->band < 1){ fprintf(stderr, "Band must be >= 1\n"); usage(argv[0], FAILURE); } @@ -143,7 +143,7 @@ int main(int argc, char *argv[]){ args_t args; GDALDatasetH fp; GDALRasterBandH band; -int i, j, nx, ny; +int i, j, nx, ny, nbands; short *line = NULL; short nodata; int has_nodata; @@ -159,11 +159,14 @@ int row; if ((fp = GDALOpen(args.file_input, GA_ReadOnly)) == NULL){ fprintf(stderr, "could not open %s.\n", args.file_input); exit(1);} - nx = GDALGetRasterXSize(fp); - ny = GDALGetRasterYSize(fp); - + nx = GDALGetRasterXSize(fp); + ny = GDALGetRasterYSize(fp); + nbands = GDALGetRasterCount(fp); + alloc((void**)&line, nx, sizeof(short)); + if (args.band > nbands){ + fprintf(stderr, "Input image has %d band(s), band %d was requested.\n", nbands, args.band); exit(1);} band = GDALGetRasterBand(fp, args.band); nodata = (short)GDALGetRasterNoDataValue(band, &has_nodata); From b9ea0a5f22006d243a46b1986e9ffe9a75338528 Mon Sep 17 00:00:00 2001 From: David Frantz Date: Fri, 2 Feb 2024 15:35:37 +0100 Subject: [PATCH 19/23] usage format --- rstats/force-sample-size.r | 4 ---- 1 file changed, 4 deletions(-) diff --git a/rstats/force-sample-size.r b/rstats/force-sample-size.r index 3ede6d5f..ef8cbfe1 100755 --- a/rstats/force-sample-size.r +++ b/rstats/force-sample-size.r @@ -47,15 +47,11 @@ usage <- function(exit){ " -i = show program's purpose\n", "\n", " -e error = standard error of the estimated overall accuracy (default: 0.01)\n", - "\n", " -s min_size = minimum sample size per class (default: 50)\n", - "\n", " -o output-file = output file path with extension,\n", " defaults to './sample-size.csv'\n", - "\n", " -c count-file = csv table with pixel counts per class\n", " 2 columns named class and count", - "\n", " -u user_acc-file = csv table with expected user accuracy per class\n", " 2 columns named class and UA", "\n" From 611c0ec841ee55f9f045e7a439c31fa6191dda70 Mon Sep 17 00:00:00 2001 From: David Frantz Date: Fri, 2 Feb 2024 15:36:14 +0100 Subject: [PATCH 20/23] finalized map_accuracy script, commenting still to do --- rstats/force-map-accuracy.r | 210 +++++++++++++++++------------------- 1 file changed, 101 insertions(+), 109 deletions(-) diff --git a/rstats/force-map-accuracy.r 
b/rstats/force-map-accuracy.r index f0e54a95..238d9915 100755 --- a/rstats/force-map-accuracy.r +++ b/rstats/force-map-accuracy.r @@ -22,9 +22,16 @@ info <- "Compute map accuracy and area statistics.\n" # load libraries #################################################### library(dplyr) +library(sf) library(getopt) +# program name ###################################################### +progname <<- + get_Rscript_filename() %>% + basename() + + # input ############################################################# # usage function in same style as other FORCE tools, @@ -33,10 +40,10 @@ usage <- function(exit){ message <- c( sprintf( - "Usage: %s [-h] [-v] [-i] [-o output-file] -c count-file \n", - get_Rscript_filename() + "Usage: %s [-h] [-v] [-i] [-o output-file] \n", + progname ), - " -m map-file -r reference-file -a pixel-area\n", + " -c count-file -s sample-file -a pixel-area\n", "\n", " -h = show this help\n", " -v = show version\n", @@ -44,21 +51,14 @@ usage <- function(exit){ "\n", " -o output-file = output file path with extension,\n", " defaults to './accuracy-assessment.txt'\n", - "\n", " -c count-file = csv table with pixel counts per class\n", - " 2 columns named class and count", - "\n", - " -m map-file = csv table with predicted class labels\n", - " 2 columns named ID and map", - "\n", - " -r reference-file = csv table with reference class labels\n", - " 2 columns named ID and reference", - "\n", + " 2 columns named class and count\n", + " -s sample-file = vector file with predicted and reference class labels\n", + " 2 columns named label_map and label_reference\n", " -a pixel-area = area of one pixel in desired reporting unit, e.g.\n", " 100 for a Sentinel-2 based map to be reported in m², or\n", " 0.01 for a Sentinel-2 based map to be reported in ha\n", - "\n", - + "\n" ) cat( @@ -92,7 +92,7 @@ exit_with_error <- function(argument) { } file_existing <- function(path) { - if (!file.exist(path)){ + if (!file.exists(path)) { cat( sprintf("file %s does not exist\n", path), file = stderr() @@ -108,11 +108,10 @@ spec <- matrix( "info", "i", 0, "logical", "output", "o", 2, "character", "counts", "c", 1, "character", - "map", "m", 1, "character", - "reference", "r", 1, "character", + "sample", "s", 1, "character", "pixel_area", "a", 1, "numeric" - ), - byrow = TRUE, + ), + byrow = TRUE, ncol = 4 ) @@ -120,55 +119,36 @@ opt <- getopt(spec) if (!is.null(opt$help)) usage() if (!is.null(opt$info)) exit_normal(info) -if (!is.null(opt$version)) exit_normal("Printing function not implemented yet. Sorry.\n") +if (!is.null(opt$version)) exit_normal("Printing function not implemented yet. Sorry.") if (is.null(opt$counts)) exit_with_error("count-file is missing!") -if (is.null(opt$map)) exit_with_error("map-file is missing!") -if (is.null(opt$reference)) exit_with_error("reference-file is missing!") -if (is.null(opt$area)) exit_with_error("pixel-area is missing!") +if (is.null(opt$sample)) exit_with_error("sample-file is missing!") file_existing(opt$counts) -file_existing(opt$map) -file_existing(opt$reference) +file_existing(opt$sample) if (is.null(opt$output)) opt$output <- "accuracy-assessment.txt" -cnt <- read.csv(opt$counts) -map <- read.csv(opt$map) -ref <- read.csv(opt$reference) +# read data +count <- read.csv(opt$counts) +sample <- read_sf(opt$sample) + # columns OK? 
-if (!"ID" %in% colnames(map)) exit_with_error("ID column in map-file missing")
-if (!"ID" %in% colnames(ref)) exit_with_error("ID column in reference-file missing")
-if (!"map" %in% colnames(map)) exit_with_error("map column in map-file missing")
-if (!"reference" %in% colnames(ref)) exit_with_error("reference column in reference-file missing")
-if (!"class" %in% colnames(cnt)) exit_with_error("class column in count-file missing")
-if (!"count" %in% colnames(cnt)) exit_with_error("count column in count-file missing")
+if (!"label_map" %in% colnames(sample)) exit_with_error("label_map column in sample-file missing")
+if (!"label_reference" %in% colnames(sample)) exit_with_error("label_reference column in sample-file missing")
+if (!"class" %in% colnames(count)) exit_with_error("class column in count-file missing")
+if (!"count" %in% colnames(count)) exit_with_error("count column in count-file missing")
 
 
 # main thing ########################################################
 
-# join input tables
-table <-
-  reference %>%
-  inner_join(
-    map,
-    by = "ID"
-  )
-
-# join worked?
-if (nrow(table) != nrow(reference)){
-  exit_with_error("map and reference files could not be joined")
-}
-
-table <- data.frame(ID = 1:1000, map = round(runif(1000, 8, 12)), reference = round(runif(1000, 8, 12)))
-
 # get all classes
-classes <- 
+classes <-
   c(
-    table$map,
-    table$reference
+    sample$label_map,
+    sample$label_reference
   ) %>%
   unique() %>%
   sort()
@@ -176,13 +156,13 @@ n_classes <- length(classes)
 
 
 # initialize confusion matrix
-confusion_counts <- 
+confusion_counts <-
   matrix(
-    NA, 
-    n_classes, 
-    n_classes, 
+    NA,
+    n_classes,
+    n_classes,
     dimnames = list(
-      map = classes, 
+      map = classes,
       reference = classes
     )
   )
@@ -191,33 +171,17 @@ confusion_counts <-
 
 
 for (m in 1:n_classes){
   for (r in 1:n_classes){
-    confusion_counts[m, r] <- 
-      table %>%
+    confusion_counts[m, r] <-
+      sample %>%
       filter(
-        map == classes[m] &
-        reference == classes[r]
+        label_map == classes[m] &
+        label_reference == classes[r]
       ) %>%
       nrow()
 
   }
 }
 
-classes <- 1:4
-n_classes <- 4
-confusion_counts <- matrix(
-  c(
-    150, 12, 1, 2,
-    32, 100, 21, 3,
-    0, 32, 120, 0,
-    0, 0, 5, 130
-  ), byrow = TRUE,
-  n_classes,
-  n_classes,
-  dimnames = list(
-    map = classes,
-    reference = classes
-  )
-)
 
 acc_metrics <- function(confusion_matrix) {
 
@@ -261,19 +225,15 @@ acc_metrics <- function(confusion_matrix) {
 
 }
 
-
-cnt <- data.frame(class = 1:4, count = c(20000,200000,300000,350000))
-opt <- list(pixel_area = 30^2/10000, output = "accuracy-assessment.txt") # ha
-
 # compute proportional area per class, area in reporting unit, and sort the classes
-cnt <-
-  cnt %>%
+count <-
+  count %>%
   arrange(class) %>%
   mutate(weight = count / sum(count)) %>%
   mutate(area = count * opt$pixel_area)
 
 # classes should now align with map/reference dataset
-if (!any(cnt$class == classes))
+if (!all(count$class == classes))
   exit_with_error("classes in file-count do not match with map or reference classes")
 
 # Olofsson et al. 2013, eq. 1
 confusion_adjusted <-
@@ -286,7 +246,7 @@ confusion_adjusted <-
     byrow = FALSE
   ) *
   matrix(
-    cnt$weight,
+    count$weight,
     n_classes,
     n_classes,
     byrow = FALSE
@@ -296,7 +256,7 @@ confusion_adjusted <-
 
 # Olofsson et al. 2013, eq.
2 area_adjusted <- - colSums(confusion_adjusted) * sum(cnt$area) + colSums(confusion_adjusted) * sum(count$area) @@ -322,7 +282,7 @@ matrix( byrow = FALSE ) * matrix( - cnt$weight**2, + count$weight**2, n_classes, n_classes, byrow = FALSE @@ -330,9 +290,8 @@ matrix( } %>% colSums() %>% sqrt() %>% - `*`(sum(cnt$area)) %>% + `*`(sum(count$area)) %>% `*`(1.96) -confidence_area_adjusted # Olofsson et al. 2013, eq. 6-8 @@ -340,7 +299,7 @@ acc_traditional <- acc_metrics(confusion_counts) acc_adjusted <- acc_metrics(confusion_adjusted) # Olofsson et al. 2014, eq. 5 -oa_se <- sum(cnt$weight**2 * acc_adjusted$ua * (1 - acc_adjusted$ua) / (rowSums(confusion_counts) - 1)) %>% +oa_se <- sum(count$weight**2 * acc_adjusted$ua * (1 - acc_adjusted$ua) / (rowSums(confusion_counts) - 1)) %>% sqrt() %>% `*`(1.96) @@ -355,7 +314,7 @@ sqrt() %>% Nj <- { matrix( - cnt$count, + count$count, n_classes, n_classes, byrow = FALSE @@ -371,7 +330,7 @@ Nj <- colSums() -term1 <- cnt$count**2 * +term1 <- count$count**2 * (1 - acc_adjusted$pa)**2 * acc_adjusted$ua * (1 - acc_adjusted$ua) / @@ -386,7 +345,7 @@ diag(leave_class_out) <- 0 term2 <- { matrix( - cnt$count**2, + count$count**2, n_classes, n_classes, byrow = FALSE @@ -431,24 +390,57 @@ sqrt() %>% -confusion_counts -confusion_adjusted - -acc_traditional -acc_adjusted - -cnt$area -area_adjusted -confidence_area_adjusted - -oa_se -pa_se -ua_se - fo <- file(opt$output, "w") +sink(file = fo, append = TRUE, type = "output") + +cat("Traditional Accuracy assessment\n") +cat("-----------------------------------------------------------------\n") +cat("\n") +cat("Traditional confusion matrix, expressed in terms of pixel counts:\n") +cat("\n") +print(confusion_counts) +cat("\n") +cat(sprintf("Overall Accuracy (OA): %.2f\n", acc_traditional$oa)) +cat("\n") +print_stats <- cbind( + sprintf("%.2f", acc_traditional$pa), + sprintf("%.2f", acc_traditional$ua), + sprintf("%.2f", acc_traditional$oe), + sprintf("%.2f", acc_traditional$ce) +) +colnames(print_stats) <- c("Producer's Accuracy", "User's Accuracy", "Error of Omission", "Error of Commission") +rownames(print_stats) <- classes +print(print_stats, quote = FALSE) +cat("\n") +cat("\n") +cat("Area-Adjusted Accuracy\n") +cat("-----------------------------------------------------------------\n") +cat("\n") +cat("Confusion matrix, expressed in terms of proportion of area:\n") +cat("\n") +print(confusion_adjusted) +cat("\n") +cat(sprintf("Overall Accuracy (OA): %.2f \u00b1 %.2f\n", acc_adjusted$oa, oa_se)) +cat("\n") +print_stats <- cbind( + sprintf("%.2f \u00b1 %.2f", acc_adjusted$pa, pa_se), + sprintf("%.2f \u00b1 %.2f", acc_adjusted$ua, ua_se), + sprintf("%.2f \u00b1 %.2f", acc_adjusted$oe, pa_se), + sprintf("%.2f \u00b1 %.2f", acc_adjusted$ce, ua_se) +) +colnames(print_stats) <- c("Producer's Accuracy", "User's Accuracy", "Error of Omission", "Error of Commission") +rownames(print_stats) <- classes +print(print_stats, quote = FALSE) +cat("\n") +print_area <- cbind( + sprintf("%.2f \u00b1 %.2f", area_adjusted, confidence_area_adjusted), + sprintf("%.2f", count$area) +) +colnames(print_area) <- c("Estimated Area", "Mapped Area") +rownames(print_area) <- classes +print(print_area, quote = FALSE) -cat("# Accuracy assessment", file = fo) - +sink(file = NULL) close(fo) From 7a36cb02e967c237ea8565af4fd24027580b2b1f Mon Sep 17 00:00:00 2001 From: David Frantz Date: Fri, 2 Feb 2024 15:36:47 +0100 Subject: [PATCH 21/23] finalized stratified sampling, code still messy, commenting still necessary --- src/aux-level/_stratified-sample.c | 
262 +++++++++++++++++++++++------
 1 file changed, 212 insertions(+), 50 deletions(-)

diff --git a/src/aux-level/_stratified-sample.c b/src/aux-level/_stratified-sample.c
index 34f01b3f..70145974 100644
--- a/src/aux-level/_stratified-sample.c
+++ b/src/aux-level/_stratified-sample.c
@@ -37,9 +37,12 @@ This program computes a histogram of the given image
 #include "../cross-level/konami-cl.h"
 #include "../cross-level/string-cl.h"
 #include "../cross-level/table-cl.h"
+#include "../cross-level/dir-cl.h"
 
 /** Geospatial Data Abstraction Library (GDAL) **/
 #include "gdal.h"           // public (C callable) GDAL entry points
+#include "ogr_spatialref.h" // coordinate systems services
+#include "ogr_api.h"        // OGR geometry and feature definition
 
 
 typedef struct {
@@ -49,26 +52,35 @@ typedef struct {
   char file_input_sample_size[NPOW_10];
   char file_output[NPOW_10];
   char column_sample_size[NPOW_10];
+  char format[NPOW_10];
 } args_t;
 
 
 void usage(char *exe, int exit_code){
 
-  printf("Usage: %s [-h] [-v] [-i] [-b band] [-o output-file] input-image\n", exe);
+  printf("Usage: %s [-h] [-v] [-i] [-b band] [-o output-file] [-f format] \n", exe);
+  printf("       classification sample-size-table sample-size-column \n");
   printf("\n");
   printf("  -h  = show this help\n");
   printf("  -v  = show version\n");
   printf("  -i  = show program's purpose\n");
   printf("\n");
-  printf("  -b band  = band to use,\n");
+  printf("  -b band  = band to use in classification image,\n");
  printf("             defaults to 1\n");
-  printf("\n");
   printf("  -o output-file  = output file path with extension,\n");
-  printf("                    defaults to './histogram.csv'\n");
+  printf("                    defaults to './sample.gpkg'\n");
+  printf("  -f format  = output format (GDAL vector driver short name)\n");
+  printf("               defaults to GPKG\n");
   printf("\n");
   printf("  Positional arguments:\n");
-  printf("  - 'input-image': image for computing the histogram\n");
+  printf("  - 'classification': classification image\n");
+  printf("  - 'sample-size-table': table with sample size per class with at least 3 named columns\n");
+  printf("        class: class ID, needs to match classes in classification\n");
+  printf("        count: number of pixels per class\n");
+  printf("        ***:   number of sample points per class;\n");
+  printf("               column name is variable (see next argument)\n");
+  printf("  - 'sample-size-column': column name of sample size\n");
   printf("\n");
 
   exit(exit_code);
@@ -78,16 +90,19 @@ void usage(char *exe, int exit_code){
 void parse_args(int argc, char *argv[], args_t *args){
 int opt;
+bool o, f;
 
 
   opterr = 0;
 
   // default parameters
-  copy_string(args->file_output, NPOW_10, "histogram.csv");
+  copy_string(args->file_output, NPOW_10, "sample.gpkg");
+  copy_string(args->format,      NPOW_10, "GPKG");
   args->band = 1;
+  o = f = false;
 
   // optional parameters
-  while ((opt = getopt(argc, argv, "hvio:b:")) != -1){
+  while ((opt = getopt(argc, argv, "hvio:f:b:")) != -1){
     switch(opt){
       case 'h':
         usage(argv[0], SUCCESS);
@@ -100,9 +115,13 @@ int opt;
       case 'o':
         copy_string(args->file_output, NPOW_10, optarg);
+        o = true;
         break;
+      case 'f':
+        copy_string(args->format, NPOW_10, optarg);
+        f = true;
+        break;
       case 'b':
         args->band = atoi(optarg);
-        if (args->band < 0){
+        if (args->band < 1){
           fprintf(stderr, "Band must be >= 1\n");
           usage(argv[0], FAILURE);
         }
@@ -141,6 +160,14 @@ int opt;
     usage(argv[0], FAILURE);
   }
 
+  if ((!o && f) || (!f && o)){
+    fprintf(stderr, "If -f is given, -o needs to be given, too.\n"); usage(argv[0], FAILURE);
+  }
+
+  if (fileexist(args->file_output)){
+    fprintf(stderr, "sample already exists: %s.\n", args->file_output); usage(argv[0], FAILURE);
+  }
+
return; } @@ -149,17 +176,39 @@ int main(int argc, char *argv[]){ args_t args; GDALDatasetH fp; GDALRasterBandH band; -int i, j, nx, ny; +int i, j, nx, ny, nbands; short *line = NULL; short nodata; int has_nodata; table_t sample_size; int col_size, col_class, col_count, row; +char *wkt = NULL; +char projection[NPOW_10]; +double geotran[6]; int offset = SHRT_MAX+1; int length = USHRT_MAX+1; double **dictionary = NULL; -enum { _CLASS_, _SIZE_, _PROBABILITY_, _COUNT_, _DICT_LENGTH_ }; +enum { _DICT_CLASS_, _DICT_SIZE_, _DICT_PROBABILITY_, _DICT_COUNT_, _DICT_LENGTH_ }; + +table_t sample; +int count = 0; +enum { _SAMPLE_CLASS_, _SAMPLE_X_, _SAMPLE_Y_, _SAMPLE_PROBABILITY_, _SAMPLE_LENGTH_ }; + +double probability; + + +int class_ID, size_target, size_sampled; +int row2, row_max; +double max; +OGRSFDriverH driver; +OGRSpatialReferenceH srs; +GDALDatasetH fp_out; +OGRLayerH layer; +OGRFieldDefnH field; +OGRFeatureH feature; +OGRGeometryH point; + parse_args(argc, argv, &args); @@ -171,44 +220,51 @@ enum { _CLASS_, _SIZE_, _PROBABILITY_, _COUNT_, _DICT_LENGTH_ }; if ((col_count = find_table_col(&sample_size, "count")) < 0){ printf("could not find column name %s in file-sample-size\n", "count"); exit(FAILURE);} - print_table(&sample_size, false); + #ifdef FORCE_DEBUG + print_table(&sample_size, false, false); printf("column %s in column %d\n", "class", col_class); printf("column %s in column %d\n", "count", col_count); printf("column %s in column %d\n", args.column_sample_size, col_size); - printf("min/max class: %d/%d\n", (int)sample_size.min[col_class], (int)sample_size.max[col_class]); - - + #endif alloc_2D((void***)&dictionary, length, _DICT_LENGTH_, sizeof(double)); for (row=0; row nbands){ + fprintf(stderr, "Input image has %d band(s), band %d was requested.\n", nbands, args.band); exit(1);} alloc((void**)&line, nx, sizeof(short)); band = GDALGetRasterBand(fp, args.band); @@ -219,17 +275,9 @@ enum { _CLASS_, _SIZE_, _PROBABILITY_, _COUNT_, _DICT_LENGTH_ }; exit(1); } - int cnt = 0; -double probability; - /** FILE *fp_ = NULL; - fp_ = fopen("test.txt", "w"); - for (i=0; i<346589861; i++){ - probability = (double)rand()/(double)RAND_MAX; - fprintf(fp_, "%f\n", probability); - } - fclose(fp_); - exit(0); - **/ + + srand(time(NULL)); + for (i=0; i= probability){ + dictionary[line[j] + offset][_DICT_COUNT_]++; + //printf("%d samples pre-selected (row %d, col %d)\n", count, i, j); + + if (count < sample.nrow){ + sample.data[count][_SAMPLE_CLASS_] = line[j]; + sample.data[count][_SAMPLE_X_] = geotran[0] + j * geotran[1] + geotran[1] * 0.5; + sample.data[count][_SAMPLE_Y_] = geotran[3] + i * geotran[5] + geotran[5] * 0.5; + sample.data[count][_SAMPLE_PROBABILITY_] = probability; + sample.row_mask[count] = true; + count++; + } - if (dictionary[line[j] + offset][_PROBABILITY_] >= probability){ - dictionary[line[j] + offset][_COUNT_]++; - printf("%d samples selected (row %d, col %d)\n", cnt++, i, j); } - } } - for (row=0; row max){ + max = sample.data[row2][_SAMPLE_PROBABILITY_]; + row_max = row2; + } + + } + + printf("class %d, size %d <-> %d, row_max %d, max_prob %f\n", + class_ID, size_target, size_sampled, row_max, max); + + sample.row_mask[row_max] = false; + + size_sampled--; + } + + } + + + + OGRRegisterAll(); + + // get driver + if ((driver = OGRGetDriverByName(args.format)) == NULL){ + fprintf(stderr, "%s driver is not available.\n", args.format); usage(argv[0], FAILURE); + } + + + // create file + if ((fp_out = OGR_Dr_CreateDataSource(driver, args.file_output, NULL)) == NULL){ + printf("Error 
creating output file.\n"); return FAILURE;} + + + // set output projection + srs = OSRNewSpatialReference(NULL); + OSRImportFromWkt(srs, &wkt); + + // create layer + if ((layer = OGR_DS_CreateLayer(fp_out, "sample", srs, + wkbPoint, NULL)) == NULL){ + printf("Error creating layer.\n"); return FAILURE;} + + // add field + field = OGR_Fld_Create("FID", OFTInteger); + OGR_Fld_SetWidth(field, 4); + if (OGR_L_CreateField(layer, field, TRUE) != OGRERR_NONE){ + printf("Error creating field.\n"); return FAILURE;} + OGR_Fld_Destroy(field); + + // add field + field = OGR_Fld_Create("label_map", OFTInteger); + OGR_Fld_SetWidth(field, 4); + if (OGR_L_CreateField(layer, field, TRUE) != OGRERR_NONE){ + printf("Error creating field.\n"); return FAILURE;} + OGR_Fld_Destroy(field); + + // add field + field = OGR_Fld_Create("label_reference", OFTInteger); + OGR_Fld_SetWidth(field, 4); + if (OGR_L_CreateField(layer, field, TRUE) != OGRERR_NONE){ + printf("Error creating field.\n"); return FAILURE;} + OGR_Fld_Destroy(field); + + count = 0; + + for (row=0; row Date: Fri, 2 Feb 2024 15:39:32 +0100 Subject: [PATCH 22/23] install R validation scripts --- Makefile | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 83b2fcb8..aa844159 100755 --- a/Makefile +++ b/Makefile @@ -119,7 +119,7 @@ lower: table_ll param_ll meta_ll cube_ll equi7_ll glance7_ll atc_ll sunview_ll r higher: param_hl progress_hl tasks_hl read-aux_hl read-ard_hl quality_hl bap_hl level3_hl cso_hl tsa_hl index_hl interpolate_hl stm_hl fold_hl standardize_hl pheno_hl polar_hl trend_hl ml_hl texture_hl lsm_hl lib_hl sample_hl imp_hl cfimp_hl l2imp_hl spec-adjust_hl pyp_hl rsp_hl udf_hl aux: param_aux param_train_aux train_aux exe: force force-parameter force-qai-inflate force-tile-finder force-tabulate-grid force-l2ps force-higher-level force-train force-lut-modis force-mdcp force-stack force-import-modis force-cube-init force-hist force-stratified-sample -.PHONY: temp all install install_ bash python external clean build check +.PHONY: temp all install install_ bash python rstats external clean build check ### TEMP @@ -478,6 +478,8 @@ python: temp rstats: temp cp $(DR)/force-level2-report.Rmd $(TM)/force-level2-report.Rmd + cp $(DR)/force-sample-size.r $(TB)/force-sample-size + cp $(DR)/force-map-accuracy.r $(TB)/force-map-accuracy install: bash python rstats external install_ clean check From 43c5e6553053b2c32d9d2e1de091e634d85ffdbd Mon Sep 17 00:00:00 2001 From: David Frantz Date: Fri, 2 Feb 2024 15:43:31 +0100 Subject: [PATCH 23/23] updated docs --- docs/source/history/vdev.rst | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/docs/source/history/vdev.rst b/docs/source/history/vdev.rst index a54dc04f..460cc63b 100755 --- a/docs/source/history/vdev.rst +++ b/docs/source/history/vdev.rst @@ -68,7 +68,20 @@ Develop version - new auxilliary program `force-hist`. This program computes the histogram of image values (can be vrt) and writes a csv table. - This is intended to be used in a validation workflow (planned feature). + This is intended to be used in a validation workflow. + + - new auxilliary program `force-sample-size`. + This program computes the required sample size following Olofsson et al. 2013/2014, + which should be used to properly validate a classification map. + This is intended to be used in a validation workflow. + + - new auxilliary program `force-stratified-sample`. 
+    This program draws a stratified random sample based on a classification map and sample size computations.
+    This is intended to be used in a validation workflow.
+
+  - new auxiliary program `force-map-accuracy`.
+    This program computes area-adjusted accuracies following Olofsson et al. 2013/2014.
+    This is intended to be used in a validation workflow.
 
   - ``force-tabulate-grid`` has been updated to produce properly named output files.
     The default output file name is ``grid.kml``, created in the current directory, using the ``KML`` format.
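
For orientation, the area-adjusted numbers produced by force-map-accuracy follow Olofsson et al. (2013), eq. 1 and 2. Below is a minimal R sketch of that adjustment on an invented 3-class example; the confusion matrix, pixel counts, and pixel area are made up for illustration only and are not FORCE outputs, and the snippet is not the tool itself:

  # minimal sketch of the Olofsson et al. (2013) area adjustment (eq. 1 and 2);
  # all numbers below are invented for illustration only
  library(dplyr)

  # sample-based confusion matrix (rows = map label, columns = reference label)
  confusion_counts <- matrix(
    c(90,  5,  5,
       8, 80, 12,
       2,  3, 95),
    nrow = 3, byrow = TRUE,
    dimnames = list(map = 1:3, reference = 1:3)
  )

  # mapped pixel counts per class and the area of one pixel
  # (here: 0.01 ha for a 10 m x 10 m pixel)
  count <- data.frame(class = 1:3, count = c(1500000, 300000, 200000))
  pixel_area <- 0.01

  count <- count %>%
    mutate(weight = count / sum(count)) %>%  # proportion of mapped area per class
    mutate(area = count * pixel_area)        # mapped area in reporting units

  # eq. 1: estimated area proportions p_ij = W_i * n_ij / n_i. (row total)
  confusion_adjusted <- confusion_counts / rowSums(confusion_counts) * count$weight

  # eq. 2: adjusted area per reference class
  area_adjusted <- colSums(confusion_adjusted) * sum(count$area)

  # accuracies derived from the adjusted matrix
  oa <- sum(diag(confusion_adjusted))                           # overall accuracy
  ua <- diag(confusion_adjusted) / rowSums(confusion_adjusted)  # user's accuracy
  pa <- diag(confusion_adjusted) / colSums(confusion_adjusted)  # producer's accuracy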