R/wch_html2df.R

Defines functions read_a_wch_htm read_dir_wch_htm clean_lab grab_a_field wch_html2df

Documented in wch_html2df

#' convert a wch report file (html) to data.frame
#'
#' will convert the fields in a standard mk10 wch report file (html) or a directory of such files generated by mk10 host into a \code{data.frame} with fields as columns and ready for export into a csv.
#' @details this is a kludge. surely there is a better way to do it... there is weird formatting and typos in the html which ever fixed will break this code. you have been warned.
#' @param file a single wch report file (html) to convert
#' @param dir a dirctory of wch report files (html) to convert.
#' @return a \code{data.frame}. if multiple input files than each row is a new file. columns correspond to fields.
#' @export

wch_html2df <- function(dir, file) {
	if(!missing(file) & !missing(dir)) stop("you gave me both a single file and a dir...")
	if(missing(file) & missing(dir)) stop("you didn't give me either a single file or a dir...")
	
	if(missing(dir)) {
		if(!file.exists(file)) stop("i can't find that file...")
		out <- read_a_wch_htm(file)
	} else if(missing(file)) {
		if(!dir.exists(dir)) stop("i can't find that dir...")
		out <- read_dir_wch_htm(dir)
	}
	
	out
}

# constants
SIMPLE_FIELDS <- c(
	# host settings
	"MK10Host version",
	"User Name",
# time and date settings
	# missing
# General Settings
	# "Argos Ptt number", # complex field
	"Repetition Intervals",
	"Number of Argos transmissions",
	"Tagware version",
	"Hardware version",
	"Battery Configuration",
	"Battery Capacity",
	"Deploy from Standby on Depth Change?",
# these are only sometimes there and sometimes there are others
	# "Bytes of archive data collected",
	# "Bytes of histogram and profile data collected",
# Data to Archive Settings
	"Depth$",
	"Internal Temperature$",
	"External Temperature$",
	"Depth Sensor Temperature$",
	"Battery Voltage$",
	"Wet/Dry$",
	"Wet/Dry Threshold",
	"Sampling Mode",
	"Automatic Correction of Depth Transducer Drift",
# Data to transmit settings
# histogram selection
	"Histogram Data sampling interval",
	"Dive Maximum Depth \\(m\\)",
	"Dive Duration",
	"Time-at-Temperature \\(C\\)",
	"Time-at-Depth \\(m\\)",
	"20-min time-line",
	"Hourly % time-line \\(low resolution\\)",
	"Hourly % time-line \\(high resolution\\)",
	"Dry/Deep/Neither time-lines",
	"PAT-style depth-temperature profiles",
	"Light-level locations",
# histogram collection
	"Hours of data summarized in each histogram",
	"Histograms start at GMT",
	"Do not create new Histogram-style messages if a tag is continuously",
# time-series Messages
	"Generation of time-series messages",
  "Time interval between TS samples",
  "Channels sampled",
  "Start with",
  "then Duty Cyle with", # this is a typo in the html report... will they fix it? this code will break then
  "^and$",
# dive and timeline defintion
	"Depth reading to determine start and end of dive",
	"Ignore dives shallower than",
	"Ignore dives shorter than",
	"Depth threshold for timelines",
# behavior messages
	"Generation of behavior messages",
# stomach temperature messages
	"Generation of stomach temperature messages",
#Haulout Defintion
  "A minute is &quot;dry&quot; if Wet/Dry sensor is dry for any <span class=\"style10\">value</span> seconds in a minute",
  "Enter haulout state after <span class=\"style10\">value</span> consecutive dry minutes",
  "Exit haulout state if wet for any <span class=\"style10\">value</span> seconds in a minute",
  "Pause transmissions if haulout exceeds",
  "Transmit every eighth day if transmissions are paused",
	# transmission control
	"Transmit data collected over these last days",
#Collection Days
	# "January",
	# "February",
	# "March",
	# "April",
	# "May",
	# "June",
	# "July",
	# "August",
	# "September",
	# "October",
	# "November",
	# "December",
# relative transmit priorities
	"Histogram, Profiles, Time-lines, Stomach Temperature",
	"Fastloc and Light-level Locations",
	"Behavior and Time-Series",
	"Status",
# when to transmit
	"Initially transmit for these hours regardless of settings below",
	"Transmit hours",
# transmit days
	# same as collection days above will have to parse it there
# daily transmit allowance
	# same as collection days above will have to parse it there
# channel settings
	"<strong>Depth</strong>"
)

HISTO_FIELDS <- c(
  "Dive Maximum Depth \\(m\\)",
  "Dive Duration",
  "Time-at-Temperature \\(C\\)",
  "Time-at-Depth \\(m\\)",
  "20-min time-line",
  "Hourly % time-line \\(low resolution\\)",
  "Hourly % time-line \\(high resolution\\)",
  "Dry/Deep/Neither time-lines",
  "PAT-style depth-temperature profiles",
  "Light-level locations"
)

FASTLOC_FIELDS <- c(
# fastloc settings
	"Fastloc sampling interval",
	"Deployment Latitude",
	"Deployment Longitude",
	"Deployment Altitude",
	"Transmit hours", # second one
#Fastloc Collection Days
	# months (first one)
#Fastloc Control
	"Maximum successful Fastloc attempts",
	"Maximum failed Fastloc attempts",
	"Overall maximum Fastloc attempts",
	"Supress Fastloc after good haulout location",
	"Abort Fastloc processing if tag measures wet on next wakeup",
	"Always transmit latest location message",
#Fastloc Internal Parameters
	"^Mode$",
	"^Offset$",
	"^Span$",
	"^Autotune$",
	"^Threshold$",
	"^Hardware$",
	"^Software$",
	"^Serial Number$"
)

MONTHS <- c("January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December")

# tiny helper function
grab_a_field <- function(fieldname, htm_split, cleanlab = TRUE, cleanval = TRUE) {
	lab_ind <- grep(fieldname, htm_split)
	val_ind <- lab_ind + 1
	
	val_tmp <- htm_split[val_ind]
	val_tmp <- trimws(gsub("&nbsp", "", val_tmp))

	lab_tmp <- fieldname
	
	if(length(lab_ind) > 0) {
	  lab_tmp <- htm_split[lab_ind]
	}
	
	if(cleanval) val_tmp <- gsub(",", ";", val_tmp)
	if(cleanlab) lab_tmp <- clean_lab(lab_tmp)
	
	list(val = val_tmp, lab = lab_tmp)
}

# tinier helper function
clean_lab <- function(lab) {
  lab <- gsub("<span class=\"style10\">", "", lab)
  lab <- gsub("</span>", "", lab)
  lab <- gsub("&quot;", "", lab)
	lab <- gsub("'", "", lab)
	lab <- gsub(",", "", lab)
	lab <- gsub(" ", "_", lab)
	lab <- gsub("-", "_", lab)
	lab <- gsub("\\^", "", lab)
	lab <- gsub("\\$", "", lab)
	lab <- gsub("\\\\\\(", "(", lab)
	lab <- gsub("\\\\\\)", ")", lab)
	lab
}

# read an entire directory
read_dir_wch_htm <- function(dir) {
	allfiles <- list.files(dir)
	files <- allfiles[grepl("*.htm$", allfiles)]
	
	if(length(files) < 1) stop("i can't find any htm files in that dir...")
	
	df <- list()
	for(i in 1:length(files)) {
		df[[i]] <- read_a_wch_htm(file.path(dir, files[i]))
	}
	
	do.call('rbind', df)
}

# read a single wch report htm file
read_a_wch_htm <- function(file) {
	htm_lines <- readLines(file)
	htm <- paste(htm_lines, collapse = "")
	htm_split <- strsplit(htm, split = "<td>|</td>|</tr>")[[1]]
	
	isfastloc <- any(grepl("Fastloc Settings", htm_split))

	# start out and put the filename in the first column
	out <- cbind(basename(file))
	colnames(out) <- "wch_filename"

	# get the tags serial number
	f0 <- grab_a_field("Tag\'s Serial Number", htm_split)
	out <- cbind(out, "Tags_Serial_Number" = f0$val)
		
	# get the ptt numbers
	f1 <- grab_a_field("Argos Ptt number", htm_split)
	val <- strsplit(f1$val, split = "\\(|\\)| ")[[1]]
	out <- cbind(out, "PTT" = val[1], "PTT_hex" = val[4], "LUT" = val[11])
	
	# do the simple fields
	for(i in 1:length(SIMPLE_FIELDS)) {
		ff <- grab_a_field(SIMPLE_FIELDS[i], htm_split)
		
		if(SIMPLE_FIELDS[i] == "Wet/Dry$") {
			outtmp <- cbind("wet_dry_archive_period" = ff$val[1])
		} else if(SIMPLE_FIELDS[i] == "<strong>Depth</strong>") {
			val <- strsplit(grab_a_field(SIMPLE_FIELDS[i], htm_split, cleanval = FALSE)$val, split = ": |;")[[1]]
			outtmp <- cbind("depth_channel" = val[2], "depth_range" = val[4], "depth_resolution" = val[6], "depth_ADaddress" = val[8], "depth_settling_delay" = val[10])	
		} else if(SIMPLE_FIELDS[i] == "Transmit hours") {
			outtmp <- cbind(ff$val[length(ff$val)])
			colnames(outtmp) <- ff$lab[length(ff$lab)]
		} else if(SIMPLE_FIELDS[i] == "Generation of time-series messages") {
		  if(length(ff$val) == 0) {
		    lab <- ff$lab
		    ff<- grab_a_field("Generation of time-series \\(TS\\) messages", htm_split)
		    ff$lab <- lab
		  }
		  outtmp <- cbind(ff$val[length(ff$val)])
		  colnames(outtmp) <- ff$lab[length(ff$lab)]
		} else if(SIMPLE_FIELDS[i]  %in% HISTO_FIELDS) {
		  outtmp <- cbind(ff$val[1])
		  colnames(outtmp) <- clean_lab(SIMPLE_FIELDS[i])
		} else {
			outtmp <- cbind(ff$val[1])
			colnames(outtmp) <- ff$lab[1]
		}
		
		out <- cbind(out, outtmp)
	}
	
	# add fastloc particular fields or make NA
	for(i in 1:length(FASTLOC_FIELDS)) {
		if(isfastloc) {
			ff <- grab_a_field(FASTLOC_FIELDS[i], htm_split)
		} else {
			ff <- list(lab = clean_lab(FASTLOC_FIELDS[i]), val = NA)
		}
			
		if(FASTLOC_FIELDS[i] == "Transmit hours") {
			ff$lab <- paste("Fastloc", ff$lab[1], sep = "_")
			ff$val <- ff$val[1]
		} else if(FASTLOC_FIELDS[i] %in% FASTLOC_FIELDS[12:19]) {
			ff$lab <- paste("Fastloc", ff$lab, sep = "_")
		}
		
		outtmp <- cbind(ff$val[1])
		colnames(outtmp) <- ff$lab[1]
		out <- cbind(out, outtmp)
	}
	
	# add collection transmit and transmit allowance
	monthout <- cbind()
	for(i in 1:length(MONTHS)) {
		if(isfastloc) {
			val <- grab_a_field(MONTHS[i], htm_split)$val
		} else {
			val <- c(NA, grab_a_field(MONTHS[i], htm_split)$val)
		}
		
		outtmp <- cbind(val[1], val[2], val[3], val[4])
		colnames(outtmp) <- paste0(c("fastloc_collection_days_", "collection_days_", "transmit_days_", "daily_transmit_allowance_"), tolower(MONTHS[i]))
		monthout <- cbind(monthout, outtmp)
	}
	
		oo <- c(seq(1, ncol(monthout), by = 4), seq(2, ncol(monthout), by = 4), seq(3, ncol(monthout), by = 4), seq(4, ncol(monthout), by = 4))
		
	out <- cbind(out, monthout[, oo, drop = FALSE])
	out <- as.data.frame(out, stringsAsFactors = FALSE)
  out
}
 
williamcioffi/sattagutils documentation built on June 3, 2022, 10:21 a.m.