Dataset Processing/Readiness

Author

Manuel Gebetsberger, LuftBlick

Published

March 10, 2026

Introduction

This report summarizes which datasets we have on our list: which are to do, assigned, or finished. Furthermore, in combination with the dataheader query we can derive which data products are available. The list of datasets we need to consider is based on the BOPPO query which is done regularly (datasets.html), and stores the basic information on the server under /mnt/blick_storage/inventories. The underlying file is the rda-file /mnt/blick_storage/inventories/BOPPO_datasets.rda, which is used for multiple SW parts (BOPPO html, PGN dataset app, CalCoordination app, PGN2EVDC).

Get available L2 datasets

The available datasets are based on a big query as part of the BOPPO. Filtering requires at least 15 L0 files, and the data are subset to s1 instruments only.

No testing sites

Code
# Drop datasets recorded at testing/lab locations; these are not part of the
# official network list.
notest <- !(datasets$Location %in% c("ElkridgeMD","Aldino","LabSciGlob","ColumbiaMD",
                                     "GreenbeltMD","LabGSFC",
                                     "Innsbruck","LabIBK","Innsbruck-FKS","LabLuftBlick"))
datasets_todo <- datasets[notest,]

# Re-add the true (non-testing) datasets that happen to sit at testing locations.
datasets_todo <- rbind(datasets_todo,
                        datasets[datasets$PanID == 110 & datasets$Location == "Innsbruck",],
                        datasets[datasets$PanID == 106 & datasets$Location == "Innsbruck",],
                        datasets[datasets$PanID == 2   & datasets$Location == "GreenbeltMD",],
                        datasets[datasets$PanID == 32  & datasets$Location == "GreenbeltMD",],
                        datasets[datasets$PanID == 71  & datasets$Location == "GreenbeltMD",])


# skip datasets with less than 15 L0 files
subID <- which(datasets_todo$L0files >= 15)
datasets_todo <- datasets_todo[subID,]


# Show the remaining datasets (note: no explicit ordering is applied here).
# The stray trailing comma of the original call (an empty argument) was removed.
datatable(datasets_todo,
          options = list(pageLength = 10),
          caption = "List of all available datasets")
Code
# Currently only S1 are officially todo
s1_rows <- which(datasets_todo$SpecID == "s1")
datasets_todo <- datasets_todo[s1_rows, ]

sub_todo <- datasets_todo

Get Processed L2 files

Code
getDataHeader <- function(res){
  # Flatten a list of data-header query results (one JSON-like list per L2
  # file) into a data.frame with one row per header.
  #
  # Args:
  #   res: list; each element is a list with fields such as panID, specID,
  #        location, data_product, dateFirstL1processed, dateLastL1processed,
  #        filename and size_kb (any field may be absent).
  #
  # Returns:
  #   data.frame with columns PanID, SpecID, Location, Rcode, Start, End,
  #   GenerationDate, Suffix, Filename, size_kb; the three date columns are
  #   POSIXct (UTC, day precision).

  # Return the field value, or NA when the field is absent.
  # Assumes each field is a scalar -- TODO confirm against the query schema.
  fld <- function(v) if (is.null(v)) NA else v

  rows <- lapply(res, function(content_json) {
    data.frame(
      PanID          = fld(content_json$panID),
      SpecID         = fld(content_json$specID),
      Location       = fld(content_json$location),
      Rcode          = fld(content_json$data_product),
      Start          = fld(content_json$dateFirstL1processed),
      End            = fld(content_json$dateLastL1processed),
      # NOTE(review): GenerationDate duplicates dateFirstL1processed, exactly
      # as in the original code -- confirm whether a dedicated generation
      # timestamp field exists (see the commented-out format below).
      GenerationDate = fld(content_json$dateFirstL1processed),
      Suffix         = "",
      Filename       = fld(content_json$filename),
      size_kb        = fld(content_json$size_kb),
      stringsAsFactors = FALSE
    )
  })

  # Bind all rows at once instead of growing the data.frame inside a loop.
  df <- if (length(rows) > 0) do.call(rbind, rows) else data.frame(stringsAsFactors = FALSE)

  df$Start          <- as.POSIXct(df$Start,          format = "%Y-%m-%d", tz = "UTC")
  df$End            <- as.POSIXct(df$End,            format = "%Y-%m-%d", tz = "UTC")
  df$GenerationDate <- as.POSIXct(df$GenerationDate, format = "%Y-%m-%d", tz = "UTC")
  #df$GenerationDate <- as.POSIXct(df$GenerationDate,format="%Y%m%dT%H%M%OS",tz = "UTC")

  return(df)
}

Data storage at OVH

Now we can make use of the L2 processing inventory, which is based on a constantly running Python script that scrapes the L2 directories and records the date when the first L1 file was processed for the first time, together with the file size. Since the processor always produces an output, we simply skip L2 files with a size smaller than 18 kB; this is a typical size when there are only header lines in the L2 file.

Only official PGN and official dataset

Among the available dataheaders, I subset to the ones that are available in the operational DB only (no suffix) and to spectrometer 1 datasets only. The datasets we have on the list are restricted to the official PGN instruments.

Let us reshape the entire dataframe of available dataheaders

Get processing status for all datasets (old way)

This is the old way, which looks only at out2 or out0 availability.

Code
# Old-way processing status: for every dataset on the todo list, check whether
# processor 1.7 (rout0p1-7) and/or 1.8 (rout2p1-8) header products exist.
status_17    <- status_18 <- rcodes <- list()

# Column indices of reshaped_df whose names are official rcodes.
rcodeIndex <- which(colnames(reshaped_df) %in% officialRcodes)
for (i in 1:nrow(datasets_todo)){
  
  # Human-readable dataset key.
  # NOTE(review): datasetName is built but never used below.
  datasetName <- sprintf("%s-%s",
                         datasets_todo$PanID[i],
                         datasets_todo$Location[i])
  
  # Available headers and rcode dates for this PanID/Location combination.
  availSubset <- subset(officialHeaders,PanID == datasets_todo$PanID[i] & Location == datasets_todo$Location[i])
  availRcodes <- subset(reshaped_df,    PanID == datasets_todo$PanID[i] & Location == datasets_todo$Location[i])[,rcodeIndex]
  
  p17 <- FALSE
  p18 <- FALSE
  
  if (nrow(availSubset) != 0){
    rcodes[[i]]    <- availRcodes
    # The total-ozone products serve as proxies for the processor version.
    if ("rout2p1-8" %in% availSubset$Rcode){
      p18 <- TRUE
    }
    if ("rout0p1-7" %in% availSubset$Rcode){
      p17 <- TRUE
    }
  }else{
    # No headers at all: store an NA placeholder row so that rcodes stays
    # row-aligned with datasets_todo.
    tmp_rcodes <- rep(NA,length(rcodeIndex))
    #names(tmp_rcodes) <- names(reshaped_df)[-c(1,2,3)]
    rcodes[[i]] <- tmp_rcodes
    
  }
  status_17[[i]] <- p17
  status_18[[i]] <- p18
  
  rm(availRcodes)
  
}

# Collapse the per-dataset flags into logical vectors and attach them.
status_17 <- unlist(status_17)
status_18 <- unlist(status_18)

datasets_todo$status_p17 <- status_17
datasets_todo$status_p18 <- status_18




# Derive the (old-style) L2 status per dataset. Assignment order matters:
# each later assignment overwrites earlier ones for matching rows.
status <- rep("todo", nrow(datasets_todo))   # default: nothing processed yet
status[status_17] <- "p1-7"                  # processor 1.7 output available
status[status_18] <- "p1-8"                  # processor 1.8 output available
# Datasets with both 1.7 and 1.8 products count as transfered.
status[status_17 & status_18] <- "transfered"

#status[datasets_todo$QCaction == "fieldcal"] <- "todo"

datasets_todo$status_L2_old <- status
# Datasets without a finished date still count as todo.
datasets_todo$status_L2_old[which(is.na(datasets_todo$LastFinishedDate))] <- "todo"


# Availability matrix: one row per dataset, one column per official rcode;
# cells hold the processing dates collected above (NA when unavailable).
rcodes <- as.data.frame(myRbind(rcodes))
productAvailabilityDF <- rcodes
productAvailabilityDF <- data.frame(productAvailabilityDF)
colnames(productAvailabilityDF) <- colnames(rcodes)

# order by processor versions
productAvailabilityDF <- productAvailabilityDF[,(officialRcodes)]


# Default status; overwritten below whenever a product group is complete.
statusL2 <- rep("incomplete",nrow(productAvailabilityDF))

# A group counts as complete when every expected sub-product has a date.
p17        <- apply(!is.na(productAvailabilityDF[,official_p17]),1,sum)     == length(official_p17)
p18        <- apply(!is.na(productAvailabilityDF[,official_p18]),1,sum)     == length(official_p18)
p18_oob    <- apply(!is.na(productAvailabilityDF[,official_p18_oob]),1,sum) == length(official_p18_oob)
transfered <- apply(!is.na(productAvailabilityDF[,transferedCodes]),1,sum)  == length(transferedCodes)

# TRUE when no processed data product is available at all.
todo       <- apply(!is.na(productAvailabilityDF),1,sum) == 0


status <- cbind(p17,transfered,p18,p18_oob,todo)


# Assignment order matters here: later lines overwrite earlier ones.
statusL2[todo]              <- "nothing"
statusL2[p17 & !transfered] <- "p1-7"
statusL2[transfered & p18]  <- "transfered"
statusL2[p18 & !p17]        <- "p1-8"
statusL2[p18_oob & !p18]    <- "p1-8_OoB"


datasets_todo$status_L2_new <- statusL2
datasets_todo$status_L2     <- statusL2

# Collapse the fine-grained status into the public todo/p1-8 categories.
datasets_todo$status_L2[statusL2 %in% c("incomplete","nothing")] <- "todo"
datasets_todo$status_L2[statusL2 %in% c("p1-8_OoB")] <- "p1-8"
datasets_todo <- cbind(datasets_todo,productAvailabilityDF)

sub_todo <- datasets_todo



# add the first processed date which is the date of the first OoB product
# NOTE(review): firstProcessedDate is initialised with the literal string "NA"
# and never filled or used afterwards -- confirm whether this is dead code.
firstProcessedDate <- rep("NA",nrow(productAvailabilityDF))


# Rows with a p1-7 NO2 product uploaded to EVDC.
# NOTE(review): !is.na(x != "") only tests that EVDCuploadDone is non-NA; the
# comparison against "" is swallowed by is.na(). Presumably the intent was
# non-NA AND non-empty -- confirm.
idEVDC17 <- which(!is.na(sub_todo$EVDCuploadDone != "") & (sub_todo$EVDC_NO2 == "rnvs1p1-7" ))
sub_todo$EVDCUploadDone_p17 <- NA
sub_todo$EVDCUploadDone_p17[idEVDC17] <- sub_todo$EVDCuploadDone[idEVDC17]

# Rows with a p1-8 NO2 product whose upload was completed on/after the request.
idEVDC18 <- which(sub_todo$EVDCuploadDone >= sub_todo$EVDCuploadRequest  & grepl(sub_todo$EVDC_NO2,pattern="rnvs3p1-8"))
sub_todo$EVDCUploadDone_p18 <- NA
sub_todo$EVDCUploadDone_p18[idEVDC18] <- sub_todo$EVDCuploadDone[idEVDC18]

# Persist the full todo list ordered by instrument id.
sub_todo <- sub_todo[order(sub_todo$PanID),]
write.csv(sub_todo,file="PGN_datasetlist_todo.csv")

EVDC status from database

Code
get_evdc_uploads <- function(){
    # Query the PGN GraphQL endpoint for all EVDC uploads (filename plus
    # upload date) and return them ordered by upload date (oldest first).
    #
    # Returns:
    #   data.frame with columns filename and uploadDate, or an empty list
    #   when the endpoint returned no results.
    qrystring <- "{allEvdcUploads {
  results {
    filename
    uploadDate
  }
}}"
    qry <- Query$new()
    qry$query("evdc_uploads", qrystring)
    con <- GraphqlClient$new("https://hetzner.pandonia-global-network.org/query")
    raw <- con$exec(qry$queries$evdc_uploads)
    res <- jsonlite::fromJSON(raw)$data[[1]]
    if (length(res) == 0) {
        res <- list()
    } else {
        res <- res$results
        # Use TRUE/FALSE rather than the reassignable shorthand T/F.
        res <- res[order(res$uploadDate, decreasing = FALSE), ]
    }
    return(res)
}

data <- get_evdc_uploads()
Code
# Read the JSON file into R (adjust the file path as needed)
#json_file <- "metadata_evdcupload_data.json"
#raw <- fromJSON(json_file)

# Convert the JSON data to a tibble (data frame)
#data <- as_tibble(raw)


#data <- read.csv(file = "C:/LuftBlick/ESA-reports/POP2_FRM4AQ/metadata.evdcupload.csv")
#data <- data[order(data$upload_date),]
# skip the id
#data <- data[,-c(1,5)]



# Split the GEOMS filename into its underscore-separated components. Old
# p1-7 files lack the trailing version token, so they parse one field short;
# those rows are detected via the missing `version` and shifted back into
# place below. The mutate ordering is load-bearing: the originals are
# stashed first so the shift reads pre-mutation values.
df <- data %>%
  separate(filename, 
           into = c("observation", "method", "instrument", "product", "location","startdate","enddate","version"), 
           sep = "_", 
           remove = FALSE,
    fill   = "right"    # pad missing pieces on the right with NA
  ) %>%
  # 1) stash the originals
  mutate(
    orig_product   = product,
    orig_location  = location,
    orig_startdate = startdate,
    orig_enddate   = enddate
  ) %>%
  # 2) on rows with missing version, shift everything down one slot
  mutate(
    # new product is always your default when version is missing
    product   = if_else(is.na(version), "rnvs1.1.7", product),
    # everything else just borrows from the column to its left *before* we mutate
    location  = if_else(is.na(version), orig_product, location),
    startdate = if_else(is.na(version), orig_location, startdate),
    enddate   = if_else(is.na(version), orig_startdate, enddate),
    version   = if_else(is.na(version), orig_enddate, version)
  ) %>%
  # 3) drop the stashed originals
  select(-orig_product, -orig_location, -orig_startdate, -orig_enddate)




# Decompose the product token. Two shapes exist:
#   "rd.<retrieval>.<proc>.<ver>"  (research-data products)
#   "<qastatus>.<proc>.<ver>"      (QA products)
# After stripping the optional "rd." prefix, both reduce to three dot-separated
# tokens; separate() fills missing trailing tokens with NA.
df <- df %>%
  # Identify product type: if it starts with "rd.", mark as RD; otherwise QA.
  mutate(prod_type = if_else(grepl("^rd\\.", product), "rd", "qa"),
         # For RD products, remove the "rd." prefix; for QA, keep it as-is.
         product_clean = if_else(prod_type == "rd", sub("^rd\\.", "", product), product)) %>%
  # Split the cleaned product string into three tokens using the dot as separator.
  separate(product_clean, into = c("tok1", "tok2", "tok3"), sep = "\\.", remove = FALSE) %>%
  # Create new columns based on product type:
  # For QA products, assign tok1 to qastatus.
  # For RD products, assign tok1 to retrieval.
  # In both cases, combine tok2 and tok3 with a dot to form the processor.
  mutate(qastatus = if_else(prod_type == "qa", tok1, NA_character_),
         retrieval = if_else(prod_type == "rd", tok1, NA_character_),
         processor = paste(tok2, tok3, sep = ".")) %>%
  # Optionally remove the temporary tokens.
  select(-tok1, -tok2, -tok3)


#split owner and instrument
df <- df %>%
  mutate(
    instrument_owner = str_extract(instrument, "^[A-Za-z]+"),
    instrument_id    = str_extract(instrument, "[0-9]+$")
  )

#df       <- df[order(as.Date(df$upload_date)),]
df$count <- 1

df$rv <- sprintf("%s_%s_%s",df$instrument_id,df$location,df$product_clean)
df$dataset <- sprintf("%s_%s_%s",df$instrument_id,df$location,df$processor)

# remove non-uploaded geoms-files
na_index <- is.na(df$uploadDate)
if (any(na_index)){
  df <- df[!na_index,]
}

File per product

Code
# Counts per retrieval product (used for the y-axis range and the legend total).
files_summary <- summary(as.factor(df$product_clean))

# One colour per available product (the dangling empty `register = ` argument
# of the original call was removed; the default is used either way).
products_avail <- unique(df$product_clean)
colors     <- qualitative_hcl(n = length(products_avail), h = c(-154, 253), c = 46, l = 79)

# Empty frame spanning the full upload period; per-product curves come below.
plot(Sys.Date(),1,ylim=c(0,max(files_summary)),xlim=range(as.Date(df$uploadDate)),type="l",xlab="Time [UTC]",ylab="# uploaded files",main="EVDC data transfer status in p1-8",col=1)

for (i in seq_along(products_avail)){
  retrieval_version <- products_avail[i]
  tmp <- df %>% filter(product_clean == retrieval_version)
  # Cumulative number of uploaded files for this product over time.
  counts <- cumsum(tmp$count)
  points(as.Date(tmp$uploadDate),counts,col=colors[i],pch=16)
  #plot(as.Date(tmp$upload_date),counts,col=i+1,pch=16,main=product)
}
#legend("topleft",c(products_avail),col=c(colors),pch=16,bty="n")


legend("topleft",c(sprintf("Total: %s",sum(files_summary)),products_avail),col=c(NA,colors),pch=16,bty="n")
# Mark "today" with a vertical line and a small date label on top.
abline(v=Sys.Date(),lwd=2)
mtext(Sys.Date(),side=3,at=Sys.Date(),cex = 0.5)

First upload of a dataset

A dataset here is defined by an instrument (e.g. nasa.gsfc021) and a location (e.g. bremen), together with the date of the first uploaded file — independent of whether it is qa or rd, or of the retrieval code itself.

Code
# First upload date per dataset key (instrument_id + location + processor),
# with the key split back into its components in one pipeline.
agg <- df %>%
  group_by(dataset) %>%
  summarize(min_upload_date = min(as.Date(uploadDate))) %>%
  separate(
    dataset,
    into = c("instrument_id", "location", "processor"),
    sep = "_",
    remove = TRUE
  )

# Unit weight per dataset for cumulative counting, oldest first.
agg$count <- 1
agg <- agg[order(agg$min_upload_date), ]
# Already a Date; re-coercion kept to mirror the original flow.
agg$min_upload_date <- as.Date(agg$min_upload_date)

agg_dataset <- agg

The following table lists the uploaded datasets.

Code
# Interactive table of first-upload dates per dataset.
datatable(agg_dataset)
Code
# Cumulative number of datasets with at least one EVDC upload over time.
plot(agg_dataset$min_upload_date,cumsum(agg_dataset$count),type="o",lwd=2,pch=16,cex=0.7,xlab="Time [UTC]",ylab="# datasets",main="First dataset upload")
# Mark "today" with a vertical line and a small date label on top.
abline(v=Sys.Date(),lwd=2)
mtext(Sys.Date(),side=3,at=Sys.Date(),cex = 0.5)

First retrieval version upload for a pan + location

Code
# First upload date per retrieval-version key (instrument_id + location +
# product), with the key split back into its components in one pipeline.
agg <- df %>%
  group_by(rv) %>%
  summarize(min_upload_date = min(as.Date(uploadDate))) %>%
  separate(
    rv,
    into = c("instrument_id", "location", "product"),
    sep = "_",
    remove = TRUE
  )

# Unit weight per entry for cumulative counting, oldest first.
agg$count <- 1
agg <- agg[order(agg$min_upload_date), ]
# Already a Date; re-coercion kept to mirror the original flow.
agg$min_upload_date <- as.Date(agg$min_upload_date)
agg_rv <- agg
Code
# Interactive table of first-upload dates per retrieval version.
datatable(agg_rv)
Code
reporting_period_start <- get_current_quarter_start()

# Count rows per product and return the counts sorted in descending order.
make_summary <- function(x){
  res <- summary(as.factor(x$product))
  # TRUE/FALSE instead of the reassignable shorthand T.
  res <- res[order(res, decreasing = TRUE)]
  return(res)
}

rv_summary        <- make_summary(agg_rv)
rv_summary_period <- make_summary(subset(agg_rv,min_upload_date>=reporting_period_start))

# Empty frame (col = NA hides the points); per-product curves are added below.
plot(agg_dataset$min_upload_date,cumsum(agg_dataset$count),type="o",col=NA,xlab="Time [UTC]",ylab="# datasets",main="Retrieval version first upload")

rv_avail <- names(rv_summary)
for (i in seq_along(rv_avail)){
  retrieval_version <- rv_avail[i]
  tmp <- agg_rv %>% filter(product == retrieval_version)
  counts <- cumsum(tmp$count)
  lines(tmp$min_upload_date,counts,col=colors[i],pch=16,type="o")
}
legend("topleft",c(rv_avail),col=c(colors),pch=16,bty="n")

# Totals on the right axis: overall count plus the increase in this period.
mtext(rv_summary,side=4,       at=rv_summary,las=2,col=colors)
mtext(sprintf("+%s",rv_summary_period[names(rv_summary)]),side=4,at=rv_summary,las=2,col=colors,adj = 1.5,cex=0.8)

# Mark "today" and the start of the reporting period (the duplicated second
# abline at the reporting-period start was removed).
abline(v=Sys.Date(),lwd=2)
abline(v=reporting_period_start,lwd=2)

mtext(Sys.Date(),side=3,at=Sys.Date(),   cex = 0.5,adj = 0)
mtext(reporting_period_start,side=3,at=reporting_period_start,cex = 0.5,adj = 1)

create EVDC information and merge with inventory information

Code
# GEOMS location metadata for all PGN sites (maps PGN names to GEOMS names).
geomsInfo <- jsonlite::fromJSON("https://hetzner.pandonia-global-network.org/geoms_locations")

# Key "<PanID>_<Location>" for every dataset on the todo list.
name_on_todo_list   <- sprintf("%s_%s",
                               datasets_todo$PanID,
                               datasets_todo$Location)
Code
# get proper pgn name
locations_on_evdc <- unique(df$location)
geoms_names       <- tolower(geomsInfo$locations$geoms_name)

# Overrule the BOPPO-derived EVDC columns: wipe them here and re-fill them
# from the EVDC upload inventory in the loop below.
evdc_cols <- c("EVDC_NO2", "EVDC_NO2_sky", "EVDC_H2O", "EVDC_H2O_sky",
               "EVDC_SO2", "EVDC_O3", "EVDC_HCHO", "EVDC_HCHO_sky",
               "EVDCuploadRequest", "EVDCuploadDone")
for (column in evdc_cols) {
  sub_todo[[column]] <- NA
}
sub_todo$status_EVDC <- "planned/inPreparation"



# go over datasets from the todo_list
# Map each EVDC product string to the BOPPO column it fills, the rcode value
# to write, and the resulting EVDC status (replaces seven copy-pasted
# if-blocks with identical structure).
rv_map <- list(
  "rnvs1.1.7" = list(col = "EVDC_NO2",      code = "rnvs1p1-7", status = "p1-7"),
  "rnvs3.1.8" = list(col = "EVDC_NO2",      code = "rnvs3p1-8", status = "p1-8"),
  "rout2.1.8" = list(col = "EVDC_O3",       code = "rout2p1-8", status = "p1-8"),
  "rfus5.1.8" = list(col = "EVDC_HCHO",     code = "rfus5p1-8", status = "p1-8"),
  "rsus1.1.8" = list(col = "EVDC_SO2",      code = "rsus1p1-8", status = "p1-8"),
  "rnvh3.1.8" = list(col = "EVDC_NO2_sky",  code = "rnvh3p1-8", status = "p1-8"),
  "rfuh5.1.8" = list(col = "EVDC_HCHO_sky", code = "rfuh5p1-8", status = "p1-8")
)

for (i in seq_len(nrow(sub_todo))){
  tmp <- sub_todo[i,]
  geoms_id      <- which(geomsInfo$locations$name == tmp$Location)

  # Skip datasets whose location is unknown to the GEOMS metadata.
  if (length(geoms_id) == 0) next

  # 1. check if we even have a GEOMS name for the PGN location
  existing_name <- geomsInfo$locations$geoms_name[geoms_id] != ""

  if (existing_name){
    # check if there are files on evdc
    geoms_name <- tolower(geomsInfo$locations$geoms_name[geoms_id])

    matching_rv <- agg_rv$location == geoms_name & as.numeric(agg_rv$instrument_id) == as.numeric(tmp$PanID)
    if (any(matching_rv)){
      available_on_evdc <- agg_rv[matching_rv,]
      #order by product so that nvs1 is first
      available_on_evdc <- available_on_evdc[order(available_on_evdc$product),]

      # 2. go over the available files and record what is there. Later rows
      # overwrite EVDCuploadDone/status_EVDC, so the alphabetically last
      # matching product determines the final values (as in the original).
      for (rv in seq_len(nrow(available_on_evdc))){
        available_rv <- available_on_evdc$product[rv]
        entry <- rv_map[[available_rv]]
        if (is.null(entry)) next

        sub_todo[i, entry$col]           <- entry$code
        sub_todo[i, "EVDCuploadRequest"] <- "2000-01-01"
        sub_todo[i, "EVDCuploadDone"]    <- as.character(available_on_evdc$min_upload_date[rv])
        sub_todo[i, "status_EVDC"]       <- entry$status
      }

    }

  }

}

datatable(sub_todo)

Make EVDC status for each retrieval version

Code
sub_official <- sub_todo

# Per-retrieval-version EVDC status; default "planned".
sub_official$EVDC_rnvs3 <- "planned"
sub_official$EVDC_rout2 <- "planned"
sub_official$EVDC_rsus1 <- "planned"
sub_official$EVDC_rfus5 <- "planned"
sub_official$EVDC_rwvt1 <- "planned"
sub_official$EVDC_rnvh3_H2O <- "planned"
sub_official$EVDC_rnvh3_NO2 <- "planned"
sub_official$EVDC_rfuh5_HCHO<- "planned"

# Mark a retrieval version as cleared when the corresponding gas column holds
# the matching p1-8 rcode.
sub_official[which(sub_official$EVDC_NO2 == "rnvs3p1-8"), "EVDC_rnvs3"] <- "cleared"
sub_official[which(sub_official$EVDC_O3 == "rout2p1-8"),  "EVDC_rout2"] <- "cleared"
# BUGFIX: SO2 (rsus1) and HCHO (rfus5) previously updated each other's status
# columns; each now updates its own.
sub_official[which(sub_official$EVDC_SO2 == "rsus1p1-8"), "EVDC_rsus1"] <- "cleared"
sub_official[which(sub_official$EVDC_HCHO== "rfus5p1-8"), "EVDC_rfus5"] <- "cleared"
sub_official[which(sub_official$EVDC_H2O == "rwvt1p1-8"), "EVDC_rwvt1"] <- "cleared"
sub_official[which(sub_official$EVDC_H2O_sky == "rnvh3p1-8"), "EVDC_rnvh3_H2O"]  <- "cleared"
sub_official[which(sub_official$EVDC_NO2_sky == "rnvh3p1-8"), "EVDC_rnvh3_NO2"]  <- "cleared"
sub_official[which(sub_official$EVDC_HCHO_sky == "rfuh5p1-8"),"EVDC_rfuh5_HCHO"] <- "cleared"


write.csv(sub_official,file="PGN_datasetlist_todo.csv")

subset PGN datasets only

Code
# Keep only instruments with official PGN status and persist the list.
officials <- sub_official[which(sub_official$PGNstatus == "official"), ]
write.csv(officials, file = "PGN_datasetlist_official.csv")
Code
#| fig-height: 5
#| fig-width: 10

plotProducts <- function(x){
  # Draw a product-availability heatmap: one column per dataset, one band per
  # product column of x; palette colours 2/3 encode the 0/1 cell values.

  # Band geometry, kept for the (currently disabled) annotations below.
  band_height <- 1 / ncol(x)
  band_ticks  <- seq(0, 1, band_height)

  par(xaxt = "n", yaxt = "n", xaxs = "i", yaxs = "i")

  plot(NULL, xlim = c(0, 1), ylim = c(0, 1))
  image(as.matrix(x), col = c(2, 3), zlim = c(0, 1),
        main = "Product Availability", xlab = "Dataset sorting by PanID")
  #abline(h = band_ticks, col = 1, lty = 1)

  #mtext(colnames(x), side = 2, at = band_ticks[-1] - band_height / 2, las = 2)
}


#productAvailabilityDF <- officials[,officialRcodes]
#plotProducts(productAvailabilityDF)

New POP-2

derive statistics

Code
getProcessingStatus <- function(datasets,datasetGroups=NULL,timeCut=Sys.time()){
  # Compute the L2 processing status of every dataset as of `timeCut`.
  #
  # Args:
  #   datasets: data.frame with one row per dataset; must contain the meta
  #     columns listed in `tokeep` below plus one date column per rcode named
  #     in `datasetGroups`.
  #   datasetGroups: named list with elements official_p17, official_p18,
  #     official_p18_oob and transferedCodes (character vectors of rcode
  #     column names).
  #   timeCut: POSIXct; product dates after this moment are ignored, which
  #     allows reconstructing the status at any point in the past.
  #
  # Returns:
  #   list with df (meta + per-rcode dates + status columns),
  #   processingStatus (counts per L2 status level) and evdcStatus
  #   (counts per EVDC status level); NA when no dataset predates timeCut.

  # subsetting by only the official Rcodes and the timeRange
  datasets <- subset(datasets,as.POSIXct(PGNsince) <= timeCut)
  if (nrow(datasets) == 0){
    # NOTE(review): this sprintf result is discarded -- the message is never
    # printed; presumably message()/cat() was intended.
    sprintf("No datasets are available before the timeCut of %s",as.character(timeCut))
    return(NA)
  }
  
  # Meta columns carried through unchanged into the result.
  tokeep <- c('PanID', 'Location', 'WaitingRoom','PGNstatus','PGNsince',
            'L0start', 'L0end', 'L0files', 'AssignedDate',
            'EVDCuploadRequest', 'EVDCuploadDone', 'status_EVDC','LastFinishedDate')
            

  meta <- datasets[,tokeep]
  


  # subset the input dataframe by only the official rcodes
  # get all official rcodes
  officialRcodes <- unique(unlist(datasetGroups))

  df <- datasets[,officialRcodes]
  
  # convert the columns to proper timestamp
  #df[, ] <- lapply(df[, ], function(x) as.POSIXct(x, format="%Y-%m-%d %H:%M:%S"))
  df[, ] <- lapply(df[, ], function(x) as.POSIXct(x, format="%Y-%m-%d"))
  
  # After converting columns to POSIXct
  # Blank out any product date that lies after the cut, so the status below
  # reflects the situation at timeCut.
  df[] <- lapply(names(df), function(col) {
    x <- df[[col]]
    x[x > timeCut] <- NA
    return(x)
  })
  

 
  # Vectorized function to check if all columns are non-NA for a given set of product codes
  all_non_na <- function(df, columns) {
    rowSums(!is.na(df[, columns])) == length(columns)
  }
  
  # Extract product codes from the time argument
  official_p17     <- datasetGroups$official_p17
  official_p18     <- datasetGroups$official_p18
  official_p18_oob <- datasetGroups$official_p18_oob
  transferedCodes  <- datasetGroups$transferedCodes
  
  # Default status
  status <- rep("incomplete", nrow(df))
  
  # Check product availability; the extra !all_non_na() terms make the
  # categories mutually exclusive before the overwrites below.
  p17 <- all_non_na(df, official_p17) & !all_non_na(df, transferedCodes)
  p18 <- all_non_na(df, official_p18) & !all_non_na(df, official_p17)
  p18_oob <- all_non_na(df, official_p18_oob) & !all_non_na(df, official_p18)
  transfered <- all_non_na(df, transferedCodes) & all_non_na(df, official_p18)
  todo <- rowSums(!is.na(df)) == 0
  
  # Apply status logic (assignment order matters: later lines win)
  status[todo]       <- "nothing"
  status[p17]        <- "p1-7"
  status[transfered] <- "transfered"
  status[p18]        <- "p1-8"
  status[p18_oob]    <- "p1-8_OoB"
  
  # combine todo's and incompletes and OoB's
  statusL2 <- status
  statusL2[which(statusL2 == "p1-8_OoB")]   <- "p1-8"
  statusL2[which(statusL2 == "incomplete")] <- "todo"
  statusL2[which(statusL2 == "nothing")]    <- "todo"
  
  # Optional: Check for any remaining "incomplete" statuses
  # NOTE(review): unreachable after the replacement above -- kept as a guard.
  if(any(statusL2 == "incomplete")) {
    warning("Some rows have status 'incomplete' after processing.")
  }
  
  df$status_L2     <- statusL2
  df$status_L2_new <- status
  
  
  # get first processing timestamp
  # Derive the firstProcessing column
  # NOTE(review): apply() coerces the POSIXct columns, so first/lastProcessing
  # are stored as numbers rather than POSIXct -- confirm downstream consumers
  # expect this.
  df$firstProcessing <- NA

  df$firstProcessing[df$status_L2 == "p1-7"] <- apply(df[df$status_L2 == "p1-7", official_p17], 1, min, na.rm = TRUE)
  df$firstProcessing[df$status_L2 == "transfered"] <- apply(df[df$status_L2 == "transfered", transferedCodes], 1, min, na.rm = TRUE)
  df$firstProcessing[df$status_L2 == "p1-8"] <- apply(df[df$status_L2 == "p1-8", official_p18], 1, min, na.rm = TRUE)
  df$firstProcessing[df$status_L2 == "p1-8_OoB"] <- apply(df[df$status_L2 == "p1-8_OoB", official_p18_oob], 1, min, na.rm = TRUE)
  
  df$lastProcessing <- NA

  df$lastProcessing[df$status_L2 == "p1-7"]       <- apply(df[df$status_L2 == "p1-7", official_p17], 1, max, na.rm = TRUE)
  df$lastProcessing[df$status_L2 == "transfered"] <- apply(df[df$status_L2 == "transfered", transferedCodes], 1, max, na.rm = TRUE)
  df$lastProcessing[df$status_L2 == "p1-8"]       <- apply(df[df$status_L2 == "p1-8", official_p18], 1, max, na.rm = TRUE)
  df$lastProcessing[df$status_L2 == "p1-8_OoB"]   <- apply(df[df$status_L2 == "p1-8_OoB", official_p18_oob], 1, max, na.rm = TRUE)

  # Hard-coded exceptions (instruments without sky measurements).
  # NOTE(review): duplicated from the script-level to_overrule list (which
  # additionally carries a SpecID element) -- keep the two in sync.
  to_overrule <- list(c("29","FairbanksAK"),
              c("40","CharlesCityVA"),
              c("53","WestportCT"))

  for (i in 1:length(to_overrule)){
    tmp <- to_overrule[[i]]
    overrule_id <- which(meta$PanID == tmp[1] & meta$Location == tmp[2])
    df$status_L2[overrule_id] <- "p1-8"
  }

    
  
  res <- list("df"               = cbind(meta,df),
              "processingStatus" = summary(factor(df$status_L2,levels=c("transfered","p1-8","p1-7","todo"))),
              "evdcStatus"       = summary(factor(meta$status_EVDC,
                                                  levels=c("p1-7","p1-8","planned/inPreparation")))
              )
  
  return(res)
}


# Product-code groups defining each processing level.
datasetGroups <- list(
  official_p17 = c("rnvs1p1-7", "rout0p1-7"),
  official_p18 = c("rfuh5p1-8", "rnvh3p1-8", "rout2p1-8","rwvt1p1-8","rnvs3p1-8", "rsus1p1-8", "rfus5p1-8"),
  official_p18_oob = c("rfuh5p1-8", "rnvh3p1-8", "rout2p1-8","rwvt1p1-8"),
  transferedCodes  = c("rnvs1p1-7", "rout0p1-7", "rnvs3p1-8", "rout2p1-8")
)

# make some exceptions of instruments that have not measured sky so it should not end on the todo list
# (getProcessingStatus carries its own copy of this list -- keep them in sync)
to_overrule <- list(c("29","s1","FairbanksAK"),
                    c("40", "s1","CharlesCityVA"),
                    c("53","s1","WestportCT"))

# skip datasets which are uncalibratable
to_skip <- list(c("195","s1","Sapporo"))

# seq_along() is safe for an empty list, unlike 1:length().
for (i in seq_along(to_overrule)){
  tmp <- to_overrule[[i]]
  cat(sprintf("Overruling L2 status of %s to 'p1-8' \n",paste0(tmp,collapse = " ")))
  overrule_id <- which(officials$PanID == tmp[1] & officials$SpecID == tmp[2] & officials$Location == tmp[3])
  
  officials$status_L2[overrule_id] <- "p1-8"
}
Overruling L2 status of 29 s1 FairbanksAK to 'p1-8' 
Overruling L2 status of 40 s1 CharlesCityVA to 'p1-8' 
Overruling L2 status of 53 s1 WestportCT to 'p1-8' 
Code
# remove datasets from the todo list in general
# remove datasets from the todo list in general
all2skip <- list()
# seq_along() is safe for an empty skip list, unlike 1:length().
for (i in seq_along(to_skip)){
  tmp <- to_skip[[i]]
  cat(sprintf("Skipping dataset %s from the general todo list \n",paste0(tmp,collapse = " ")))
  overrule_id <- which(officials$PanID == tmp[1] & officials$SpecID == tmp[2] & officials$Location == tmp[3])
  all2skip[[i]] <- overrule_id  
}
Skipping dataset 195 s1 Sapporo from the general todo list 
Code
all2skip <- unlist(all2skip)
skipped_datasets <- officials[all2skip,]
# Guard: with an empty skip list, officials[-integer(0), ] would drop ALL
# rows (and -NULL errors), so only subset when there is something to remove.
if (length(all2skip) > 0) {
  officials <- officials[-all2skip,]
}


# NOTE(review): the first call's result is immediately overwritten; it is
# kept as a smoke test of the historical cut date.
test <- getProcessingStatus(officials, datasetGroups,as.POSIXct("2023-03-30"))
test <- getProcessingStatus(officials, datasetGroups)


#for (i in 1:length(to_overrule)){
# tmp <- to_overrule[[i]]
# overrule_id <- which(test$df$PanID == tmp[1]& test$df$Location == tmp[3])
# test$df$status_L2[overrule_id] <- "p1-8"
#
#}


# Persist the current official dataset list (latest, dated, and web copy).
write.csv(test$df,file="PGN_datasetlist_official.csv")
write.csv(test$df,file=sprintf("PGN_datasetlist_official_%s.csv",as.character(Sys.Date())))
write.csv(test$df,file="/mnt/blick_storage/local/reports_html/PGN_datasetlist_official.csv")




# time based on the PGN certification
# NOTE(review): this certification-based timeline was a dead assignment --
# it was immediately overwritten by the product-based timeline below, so it
# is disabled here.
#timelineID <- sort(c(unique(as.POSIXct(officials$PGNsince)),c(Sys.time())))
# time based on the occurrence of each product
timelineID <- sort(na.omit(as.Date(unlist(productAvailabilityDF))))
timeline   <- lapply(timelineID,FUN=function(x) getProcessingStatus(officials, datasetGroups,as.POSIXct(x))$processingStatus)
timeline   <- myRbind(timeline)
timeline   <- as.data.frame(timeline)


# Persist the processing-status evolution (latest, dated, and web copy).
pop2 <- cbind(timelineID,timeline)
write.csv(pop2,file="PGN_datasetProcessing.csv")
write.csv(pop2,file=sprintf("PGN_datasetProcessing_%s.csv",as.character(Sys.Date())))
write.csv(pop2,file="/mnt/blick_storage/local/reports_html/PGN_datasetProcessing.csv")

Product processing evolution

processor evolution

Code
#matplot(timelineID,timeline,type="l",lwd=2,lty=1,xlab="Time",ylab="Dataset status")
#legend("topleft",names(timeline),lty=1,lwd=2,bty="n",col=1:ncol(timeline))
Code
# Always relative to the latest processor version we would like to have
# Aggregate counts per timeline step:
#   processed = datasets already on p1-8 (incl. those transfered to EVDC)
#   backlog   = datasets still on the old processor (p1-7) or not started
processed <- timeline$transfered+timeline$`p1-8`
backlog   <- timeline$`p1-7`+ timeline$todo
# net progress: positive when more datasets are done than pending
done      <- processed - backlog

# Column order matches the legend in plotDatasetProcessing():
# transfered, processed (cumulative p1-8), -p1-7, -backlog, progress
timelines <- cbind(timeline$transfered,processed,
                   -timeline$`p1-7`,-backlog,
                   done)

# NOTE(review): cbind()-ing the character date column onto the numeric
# matrix coerces the whole result to character — the plotting function
# has to convert it back; a data.frame would avoid this.
data <- cbind(as.character(timelineID),timelines)






plotDatasetProcessing <- function(x,start=NULL,...){
  # Plot the dataset-processing evolution as overlaid time series.
  #
  # x     : matrix/data.frame whose first column is the timestamp and whose
  #         remaining columns are the (signed) dataset counts in the order
  #         transfered, processed, -p1-7, -backlog, progress.
  # start : optional POSIXct; when given, only times after `start` are shown.
  # ...   : further arguments passed on to matplot().
  # Side effect: draws on the current graphics device; no return value.

  timelineID <- as.POSIXct(x[,1])
  # x may arrive as a character matrix (cbind() with the date column
  # coerces every column to character), so force the counts back to
  # numeric before plotting. (The original also computed an unused —
  # and mis-indexed — `progress` column here; removed.)
  timelines  <- apply(x[,-1,drop=FALSE], 2, as.numeric)

  # restrict to the requested time window
  if (!is.null(start)){
    fltID <- which(timelineID > start)
    timelineID <- timelineID[fltID]
    timelines  <- timelines[fltID,,drop=FALSE]
  }

  cols <- c("lightgreen","darkgreen",
            "lightblue","blue",
            "red")

  matplot(timelineID,timelines,type="l",
          lty=c(1,1,1,1,1),col=cols,
          lwd=c(2,2,2,2,3),ylab="todo - Datasets - done",xlab="Time",...)
  abline(h=0,lwd=2,col="grey")
  legend("topleft",c("transfered to p1-8","p1-8","p1-7","todo","progress"),col=cols,
         lty=c(1,1,1,1,1),
         lwd=c(2,2,2,2,3),pch=NA)

}

plotDatasetProcessing(data)

Code
plotDatasetProcessing(data,start = as.POSIXct(Sys.Date()-364))

Code
plotDatasetProcessing(data,start = as.POSIXct("2024-09-01"))

rcode evolution

Code
prodtime <- function(x){
  # Cumulative availability curve for one product column: for each
  # (sorted, non-NA) availability date, the number of datasets that
  # provided the product by that date.
  #
  # x : vector of dates (anything as.Date() understands); NAs are dropped.
  # Returns a data.frame with columns `time` (sorted Date) and `cumsum`.
  time <- as.Date(na.omit(x))
  time <- sort(time)
  cs   <- seq_along(time)

  # return the result explicitly (the original relied on the invisible
  # value of a trailing assignment to `res`)
  data.frame("time"=time,
             "cumsum"=cs)
}

# All product availability dates (drives the common x-axis range)
dates <- na.omit(unlist(productAvailabilityDF))
dates <- as.Date(dates)

# One qualitative colour per product column. (The dangling empty
# `register = ` argument was removed — it passed the missing symbol
# explicitly, which is equivalent to the default but error-prone.)
colors     <- qualitative_hcl(n = (ncol(productAvailabilityDF)), h = c(-154, 253), c = 46, l = 79)


# Empty canvas, then one cumulative curve per product
plot(as.Date("2019-01-01"),0,type="l",xlim=range(dates),ylim=c(0,300),xlab="Time",ylab="# L2 datasets")
for (i in seq_len(ncol(productAvailabilityDF))){
  tmp <- prodtime(productAvailabilityDF[,i])
  lines(tmp,col=colors[i],type="o",pch=i)
}
legend("topleft",names(productAvailabilityDF),col=colors,lty=1,pch=1:ncol(productAvailabilityDF),lwd=2,bty="n")

Code
# One panel per product: cumulative curve plus a horizontal line at the
# final (total) number of datasets providing that product.
par(mfrow=c(2,5),oma=c(0,0,0,0),mar=c(2,2,1,0))
for (i in seq_len(ncol(productAvailabilityDF))){
  tmp <- prodtime(productAvailabilityDF[,i])
  plot(as.Date("2019-01-01"),0,type="l",xlim=range(dates),ylim=c(0,300),xlab="Time",ylab="# L2 datasets")
  lines(tmp,col=colors[i],type="o",pch=i)
  abline(h=nrow(tmp))
  # pch=i so the legend symbol matches the plotted curve (the original
  # passed the whole vector 1:ncol, so every panel legend showed pch=1)
  legend("topleft",names(productAvailabilityDF)[i],col=colors[i],pch=i,lty=1,lwd=2,bty="n")
}

QA evolution

Code
getAllQAPeriods <- function(active=TRUE){
  # Query all QA periods from the PGN GraphQL endpoint.
  #
  # active : if TRUE (default), only QA periods flagged as active are
  #          returned; if FALSE, all periods are returned.
  # Returns a data.frame with one row per QA period (instrument,
  # spectrometer, location, rcodes, active flag, start/end/date, user).

    qrystring <- "
    {allQAPeriods {
    instrument {
      panId
    }
    spectrometer
    location {
      name
    }
    rcodes
    active
    start
    end
    date
    user {
      lastName
    }
  }
}"
    qry <- Query$new()
    qry$query("getQA", qrystring)
    con <- GraphqlClient$new("https://hetzner.pandonia-global-network.org/query")
    raw <- con$exec(qry$queries$getQA)
    res <- jsonlite::fromJSON(raw)$data[[1]]

    # Honour the `active` argument. The previous subset(res, active == TRUE)
    # always filtered, because `active` resolved to the *column* of res,
    # silently ignoring the function parameter.
    if (active){
      res <- res[res$active %in% TRUE, , drop = FALSE]
    }
    res
}

# Fetch the (active) QA periods once; keep the raw copy for reference
qa_periods_raw <- getAllQAPeriods()  
qa_periods <- qa_periods_raw

Get the QA periods which are active:

Code
# Length of each QA period in days (end - start of the covered interval)
qa_days <- as.numeric(as.POSIXct(qa_periods$end,format="%Y-%m-%dT%H:%M:%S") -  as.POSIXct(qa_periods$start,format="%Y-%m-%dT%H:%M:%S"),
                      units="days")




# Timestamp at which each QA action was entered
qa_date   <- as.POSIXct(qa_periods$date,format="%Y-%m-%dT%H:%M:%S")

# Cumulative number of QA actions over time.
# NOTE: zoo() warns when several actions share the same timestamp
# (non-unique index); harmless for the line plot below.
qa_progress <- zoo(cumsum(rep(1,length(qa_date))),sort(qa_date))
Warning in zoo(cumsum(rep(1, length(qa_date))), sort(qa_date)): some methods for
"zoo" objects do not work if the index entries in 'order.by' are not unique
Code
# Two stacked panels: per-action QA coverage (days) and the cumulative
# number of QA actions over time.
par(mfrow=c(2,1),oma=c(0,0,0,0),mar=c(4,4,2,1))
plot(qa_date,qa_days,type="h",lwd=1.5,xlab="Time[UTC]",ylab="QA period coverage [days]",main="QA actions",col=rgb(0,0,0,0.5))
plot(qa_progress,type="l",ylab="Cumulative QA actions",xlab="Time")

QA Staff

The following pie chart shows the distribution of available QA periods among staff members.

Code
pie(summary(as.factor(qa_periods$user$lastName)),col=c("lightblue","grey","lightblue","lightgreen","lightgreen"))

QA status summary

Code
# Go over all official datasets and summarize, per dataset and product,
# how many L0 measurement days are covered by an (active) QA period.
qa_dataset_summary <- list()

for (od in 1:nrow(officials)){
  qa2check <- officials[od,]
  
  # canonical dataset name, e.g. "Pandora110s1_Innsbruck"
  dataset <- sprintf("Pandora%d%s_%s",
                   qa2check$PanID,
                   qa2check$SpecID,
                   qa2check$Location)

  
  # check if there is any product even processed
  products <- officials[od,officialRcodes]
  
  
  # L0 file inventory for this instrument/spectrometer at this location
  dataset_l0 <- l0_inventory[[sprintf("Pandora%d%s",
                                      qa2check$PanID,
                                      qa2check$SpecID)]][[qa2check$Location]]
  
  # subset only QA periods for this dataset
  dataset_qa <- subset(qa_periods,instrument == qa2check$PanID & location == qa2check$Location)
  # if there is no QA period, set all to zero where we have L2 data
  
      # Initialize product summary (empty template; overwritten per product below)
  product_summary <- data.frame(
    dataset = character(),
    product = character(),
    checked = integer(),
    missing = integer(),
    total = integer(),
    QA = as.Date(character()),
    operator = character()
  )
  
      
  # Extract L0 dates
  l0_dates <- as.POSIXct(unlist(dataset_l0$res_ts))

  if (nrow(dataset_qa) == 0){
    # no QA period at all: every L0 date counts as "missing" for every product
    #cat("There is no QA period given.\n")
    for (product in names(products)){

        product_summary <- data.frame(
          dataset = dataset,
          product = product,
          checked = 0,
          missing = length(l0_dates),
          total = length(l0_dates),
          QA = NA,
          operator = NA
        )        
        qa_dataset_summary[[sprintf("%s_%s",
                                    dataset,
                                    product)]] <- product_summary  
    }
  }else{
    # ---------------------------------------------
    #cat("QA period found\n")
  
    # go over the products and look which rcodes have a QA period covering them
    for (product in names(products)){

      
      # which of this dataset's QA periods mention this product's rcode?
      product_qa <- grepl(product,dataset_qa$rcodes)
      
      #if (dataset == "Pandora20s1_Busan" & product == "rfus5p1-8") stop()
      if (any(product_qa)){
        #print("found product QA")
        # initialize: no L0 date covered yet
        covered_dates <- logical(length(l0_dates))
        
        # Loop through each QA period and check if l0_dates fall within the range.
        # NOTE(review): start/end come from JSON as character; the comparison
        # relies on R coercing them against POSIXct — confirm the ISO
        # "...T..." timestamps parse as intended.
        for (i in which(product_qa)) {
          covered_dates <- covered_dates | (l0_dates >= dataset_qa$start[i] & l0_dates <= dataset_qa$end[i])
        }
        
        # NOTE(review): QA receives ALL (sorted) QA dates; with k dates the
        # data.frame recycles into k identical rows, which inflates the
        # checked/missing sums downstream — confirm this is intended.
        product_summary <- data.frame(
          dataset = dataset,
          product = product,
          checked = sum(covered_dates),
          missing = sum(!covered_dates),
          total = length(l0_dates),
          QA = sort(dataset_qa$date,decreasing = TRUE),
          operator = paste(unique(dataset_qa$user),collapse = ",")
        ) 
        qa_dataset_summary[[sprintf("%s_%s",
                                    dataset,
                                    product)]] <- product_summary          
        
      }else{
        # product never mentioned in any QA period: everything missing
        product_summary <- data.frame(
          dataset = dataset,
          product = product,
          checked = 0,
          missing = length(l0_dates),
          total = length(l0_dates),
          QA = NA,
          operator = NA
        )        
        qa_dataset_summary[[sprintf("%s_%s",
                                    dataset,
                                    product)]] <- product_summary  
      }
      
    }

  }

}

# stack all per-dataset/product summaries into one data.frame
df <- myRbind(qa_dataset_summary)
Code
# Network-wide totals of QA-checked vs. not-yet-checked L0 days
checked_total <- sum(df$checked, na.rm = TRUE)
missing_total <- sum(df$missing, na.rm = TRUE)
network_summary <- data.frame(
  Status = c("Checked", "Missing"),
  Count = c(checked_total, missing_total)
)

# Bar plot of the two totals
barplot(
  network_summary$Count,
  names.arg = network_summary$Status,
  col = c("lightblue", "tomato"),
  main = "Network QA Overview ",
  ylab = "Count",
  xlab = "Status"
)

Code
# Pie chart: share of checked vs missing, labelled with percentages
pct <- round(100 * network_summary$Count / sum(network_summary$Count), 1)
pie(
  network_summary$Count,
  labels = paste(network_summary$Status, "\n", pct, "%"),
  col = c("lightgreen", "red"),
  main = "Proportion of Checked vs Missing"
)

Code
# One pie chart per product, showing its checked/missing proportion
unique_products <- unique(df$product)

#par(mfrow = c(ceiling(length(unique_products) / 2), 2)) # Arrange multiple plots in a grid
par(mfrow=c(3,3),oma=c(0,0,0,0),mar=c(1,1,1,1))
for (product in unique_products) {
  # rows of the summary belonging to this product
  prod_rows <- df[df$product == product, ]

  # checked vs missing counts for this product
  values <- c(sum(prod_rows$checked, na.rm = TRUE), sum(prod_rows$missing, na.rm = TRUE))
  labels <- c("Checked", "Missing")

  pie(
    values,
    labels = paste(labels, "\n", round(100 * values / sum(values), 1), "%"),
    col = c("lightgreen", "red"),
    main = product
  )
}

To be reported for POp-2 and FRM4AQ-2

This script is using the latest CalCoordination info based on BOPPO sheet extraction stored under /mnt/blick_storage/local/inventories/: - BOPPO_datasets.rda

So this baseline rda-file should be “up-to-date”: it is generated every hour, so it should not be older than 1 hour.

After this report has been generated in the script directory /home/blick/rscripts-reports/ESA-reports there are the csv’s which need to be uploaded to the cloud bucket of the pop-report under data.

from ProcessingReport:

  • PGN_datasetlist_official.csv
  • PGN_datasetProcessing.csv

which is needed to produce the Figures “Dataset processing” and “EVDC dataset status”.

from CalReport:

  • CalReport_LabAnalysis.csv
  • CalReport_LabMeasurements.csv

which is needed to produce the Figures “Lab measurement” and “Calibration analysis”.

POp-2

For crosschecking with the PoP figure numbers we should see the following:

Code
summary(as.factor(officials$status_L2))
      p1-7       p1-8       todo transfered 
         4        143          6         80 

with a total of datasets on our list:

Code
sum(summary(as.factor(officials$status_L2)))
[1] 233

These numbers do not have to be entered anywhere, but they should reflect the summary numbers in the barplots and the total amount. Note that this number is not necessarily in line with the number of instruments, because it is what we consider to be on the calibration team’s todo-list. This excludes non-certified datasets and datasets with fewer than 15 L0 files, and at the moment we also neglect the s2 spectrometers for the reporting.

The following table lists all dataproduct-relevant columns

Code
# Show all dataproduct-relevant columns for the official datasets
cols2show <- c("PanID","SpecID","Location","PGNstatus","PGNsince","status_L2_new","status_L2","status_EVDC",officialRcodes)
datatable(officials[,cols2show])

Just for completeness, the following numbers are more granular with respect OoB availability (p1-8_OoB), incomplete data availability (incomplete), and where no processed data at all (nothing):

Code
summary(as.factor(officials$status_L2_new))
incomplete    nothing       p1-7       p1-8   p1-8_OoB transfered 
         5          4          4         92         48         80 

These numbers can also be linked to the numbers reported in the POp, where all the incompletes and nothings go into todo, and the p1-8_OoB go into the p1-8.

And this should be the EVDC summary

Code
summary(as.factor(officials$status_EVDC))
                 p1-7                  p1-8 planned/inPreparation 
                    6                    90                   137 

which should again be a total in the EVDC plot of:

Code
sum(summary(as.factor(officials$status_EVDC)))
[1] 233

FRM4AQ-2

In the D10 table: https://docs.google.com/spreadsheets/d/1I1gAJ1H8OcnFnx7rIePDBCCLfRMrosSOS-V4lmPJdbA/edit#gid=597447878

in the column ‘#PGN’ we list the number of available processed datasets:

Code
# Number of datasets with at least one processed file, per official rcode
d10 <- apply(!is.na(officials[,officialRcodes]),2,sum)
#print(d10)

# The two p1-7 products are counted via the L2 status column instead
d10["rnvs1p1-7"] <- sum(na.omit(officials$status_L2 == "p1-7"))
d10["rout0p1-7"] <- sum(na.omit(officials$status_L2 == "p1-7"))

d10_pgn <- d10

datatable(subset(officials,status_L2 == "p1-7"))

Further, we also list what is available on EVDC, and cleared in BOPPO (upload request done) from PGN site. Note that the number of p1-7 will go down since it is replaced by a new processor version. Discrepancies are expected to be observed among the dataproducts, since only the “best” goes to EVDC.

Code
# Number of datasets cleared for EVDC upload, per data product.
# EVDC_NO2/EVDC_O3 carry the processor string; the remaining EVDC_*
# columns carry a cleared/planned flag.
# (grepl arguments reordered to the conventional grepl(pattern, x) —
# the original passed the data vector in the pattern position with
# `pattern=` named, which works but is misleading.)
rnvs1 <- sum(grepl("rnvs1p1-7",officials$EVDC_NO2))
rout0 <- sum(grepl("rout0p1-7",officials$EVDC_O3))

# Count the "cleared" entries of a cleared/planned column. Keeps the
# summary() element name ("cleared") so cbind() below produces the same
# row name as before.
n_cleared <- function(x) summary(factor(x,levels=c("cleared","planned")))[1]

rnvs3 <- n_cleared(officials$EVDC_rnvs3)
rout2 <- n_cleared(officials$EVDC_rout2)
rfus5 <- n_cleared(officials$EVDC_rfus5)
rsus1 <- n_cleared(officials$EVDC_rsus1)
rwvt1 <- n_cleared(officials$EVDC_rwvt1)
rfuh5 <- n_cleared(officials$EVDC_rfuh5_HCHO)
rnvh3_NO2 <- n_cleared(officials$EVDC_rnvh3_NO2)
rnvh3_H2O <- n_cleared(officials$EVDC_rnvh3_H2O)

d10_evdc <- cbind(rnvs1,rout0,rfuh5,rnvh3_NO2,rnvh3_H2O,rout2,rwvt1,rnvs3,rsus1,rfus5)

datatable(subset(officials,EVDC_NO2 != "rnvs1p1-7"))
Code
print(d10_pgn)
rnvs1p1-7 rout0p1-7 rfuh5p1-8 rnvh3p1-8 rout2p1-8 rwvt1p1-8 rnvs3p1-8 rsus1p1-8 
        4         4       220       220       225       225       176       176 
rfus5p1-8 
      176 
Code
print(d10_evdc)
        rnvs1 rout0 rfuh5 rnvh3_NO2 rnvh3_H2O rout2 rwvt1 rnvs3 rsus1 rfus5
cleared     6     0    72        80         0    56     0    90    65    68

Not to be reported: by comparing what is on EVDC with what we have, we can derive a “rejection” rate. Since almost all datasets end up on EVDC for NO2, this number serves as the benchmark:

Code
# Compare PGN-processed dataset counts with what is already on EVDC
# (same product order in both vectors).
evdc       <- d10_evdc[,c("rnvs1","rout0","rfuh5","rnvh3_NO2","rout2","rwvt1","rnvs3","rsus1","rfus5")]
pgn       <- d10_pgn[c("rnvs1p1-7","rout0p1-7","rfuh5p1-8","rnvh3p1-8","rout2p1-8","rwvt1p1-8","rnvs3p1-8","rsus1p1-8","rfus5p1-8")]

#evdc      <- c(58,37,50,46,0,0,0)
#evdc_ideal<- c(58,58,58,58,0,0,0)
# ideal: every product reaches the rnvs3 (NO2) count, which serves as benchmark
evdc_ideal <- rep(pgn["rnvs3p1-8"],length(pgn))

EVDC_numbers <- data.frame(pgn,evdc,evdc_ideal,row.names = names(pgn))
# missing  = gap to the ideal (benchmark) count
EVDC_numbers$evdc_missing  <- EVDC_numbers$evdc_ideal - EVDC_numbers$evdc
# rejected = distance to the NO2 benchmark on EVDC
EVDC_numbers$evdc_rejected <- abs(EVDC_numbers$evdc - EVDC_numbers["rnvs3p1-8","evdc"]) 
EVDC_numbers$evdc_rejectedRate <- round(EVDC_numbers$evdc_rejected/EVDC_numbers["rnvs3p1-8","evdc"],2)*100
# backlog  = percentage of non-rejected datasets not yet uploaded to EVDC
EVDC_numbers$evdc_backlog   <- round((EVDC_numbers$pgn - EVDC_numbers$evdc_rejected - EVDC_numbers$evdc)/(EVDC_numbers$pgn-EVDC_numbers$evdc_rejected),2)*100

print(EVDC_numbers[,1:2])
          pgn evdc
rnvs1p1-7   4    6
rout0p1-7   4    0
rfuh5p1-8 220   72
rnvh3p1-8 220   80
rout2p1-8 225   56
rwvt1p1-8 225    0
rnvs3p1-8 176   90
rsus1p1-8 176   65
rfus5p1-8 176   68
Code
#sum(EVDC_numbers$evdc[1:4])/sum(EVDC_numbers$pgn[1:4])
#sum(EVDC_numbers$evdc_ideal[1:4])/sum(EVDC_numbers$pgn[1:4])

Rejection rate for p1-8 data

Code
# p1-8 products only (drop the two p1-7 rows)
p18_stats <- EVDC_numbers[-c(1,2),]
# order by amount on evdc (spell out TRUE — T is a reassignable shorthand)
p18_stats <- p18_stats[order(p18_stats$evdc,decreasing = TRUE),]
print(p18_stats)
          pgn evdc evdc_ideal evdc_missing evdc_rejected evdc_rejectedRate
rnvs3p1-8 176   90        176           86             0                 0
rnvh3p1-8 220   80        176           96            10                11
rfuh5p1-8 220   72        176          104            18                20
rfus5p1-8 176   68        176          108            22                24
rsus1p1-8 176   65        176          111            25                28
rout2p1-8 225   56        176          120            34                38
rwvt1p1-8 225    0        176          176            90               100
          evdc_backlog
rnvs3p1-8           49
rnvh3p1-8           62
rfuh5p1-8           64
rfus5p1-8           56
rsus1p1-8           57
rout2p1-8           71
rwvt1p1-8          100