R/coordinate_extractors.R

#' @export
#extract coordinates
coordinate_extractor = function(s){
  string = gsub("[\r\n]", " ", string)
  coordinates = str_extract_all(string, '(?<=<named-content content-type="dwc:verbatimCoordinates">)(.*?)(?=</named-content>)')
  if (length(unlist(coordinates)) > 0){
  cord = unlist(coordinates)
  extr_coordinates = list()
  for (c in 1:length(cord)){
    # check whether coordinates are saved as geo-json
    if (str_detect(cord[c], "geo-json") == TRUE)
    {
      m = str_extract(cord[c], "(?<=\\[)(.*)(?=])")
      #change positions of latitude and longitude because in geo-json they are reversed
      split = unlist(strsplit(m, ","))
      
      extr_coordinates[[c]] = paste0(split[2],sep=",",split[1])
    }
    else{
      extr_coordinates[[c]] = cord[c]
    }
  }
  }
else
{
  extr_coordinates=NULL
}
  return(extr_coordinates)
}

#' @export
coordinate_cleaner = function(coord, coordinate_list=list()){
  #remove degree character if present
  coord = gsub("°", ".", coord)
  #remove any letter characters (N,E) if present
  coord = gsub("[a-zA-Z]+", "", coord)
  #remove ' character if present
  coord = gsub("(?<=([0-9]))(')(?=([0-9]))", "", coord, perl = TRUE)
  #remove \" character combination if present
  coord = gsub('\"','',coord)
  #remove " if present
  coord = gsub('"', '', coord)
  #remove any whitespace if present
  coord = gsub(' ', '', coord)
  
  #split string into latitude and longitude
  arr = strsplit(coord,",")[[1]]
  lat = arr[1]
  long = arr[2]
  
  #if the latitude and longitude contain a second dot remove it
  if((str_count(lat,"\\.")>1) && (str_count(long,"\\.")>1)){
    #remove second dot
    lat = toString(lat)
    long = toString(long)
    s_lat = unlist(strsplit(lat,"\\."))
    s_long = unlist(strsplit(long,"\\."))
    temp_lat = paste(s_lat[1], s_lat[2], sep=".")
    temp_long = paste(s_long[1], s_long[2], sep=".")
    
    lat = paste(temp_lat,s_lat[3],sep="")
    long = paste(temp_long,s_long[3],sep="")
  }
  
  coordinate_list[[1]] = c(lat, long)
  
  return(coordinate_list[1])
}


#' @export
null_to_na = function(x){
  if(is.null(x)){
    x = "NA"
  }
  return(x)
}

#' @export
reverse_geo = function(coordinate_list, id_list = c(), name_list = c(),latitude_list = c(), longitude_list=c(),town_list=c(),region_list=c(), country_list=c(), country_code_list=c(), massive_list=c()){
  #get the lat and long from the list
  list = unlist(coordinate_list)
  lat=list[1]
  long=list[2]
  
  #construct httr request and send it
  request = paste0("https://nominatim.openstreetmap.org/reverse?format=json&lat=",lat,"&lon=",long,"&zoom=18&addressdetails=1")
  result=GET(request)
  
  #extract from results
  osm_id = null_to_na(content(result)$osm_id)
  name = null_to_na(content(result)$display_name)
  country = null_to_na(content(result)$address$country)
  country_code = null_to_na(content(result)$address$country_code)
  region =  null_to_na(content(result)$address$region)
  town =  null_to_na(content(result)$address$town)
  
  #save to lists  (to build df)
  id_list = c(id_list, osm_id)
  name_list = c(name_list, name)
  town_list = c(town_list, town)
  region_list = c(region_list, region)
  country_list = c(country_list, country)
  country_code_list = c(country_code_list,country_code)
  latitude_list = c(latitude_list, lat)
  longitude_list = c(longitude_list, long)
  
  #id_list[1] = osm_id
  #name_list[1] = name
  #town_list[1] = town
  #region_list[1] = region
  #country_list[1] = country
  #country_code_list[1] = country_code
  #latitude_list[1] = lat
  #longitude_list[1] = long
  massive_list = c(id_list,name_list,latitude_list, longitude_list,town_list,region_list, country_list, country_code_list)
  return(massive_list)
}

#' @export
#coordinates the extraction of coordinates, requesting of openstreetmap and building of dataframe with results
coordinator = function(string){
  
  coordinates = coordinate_extractor(string)
  if (is.null(coordinates)){
    return(NULL)
  }
  else{
  #cleaning of coordinates and request writing
  clean_coordinates = sapply(coordinates, coordinate_cleaner)
  
  
  l = sapply(clean_coordinates, reverse_geo)
  
  #building data frame
  #df = as.data.frame(t(as.data.frame(unlist(l))), stringsAsFactors = FALSE)
  df = as.data.frame(t(l))
  names(df) = c("id","name", "latitude","longitude","town", "region","country","country_code")
  rownames(df) = NULL
  #remove rows with duplicate ids to conform to primary key uniqueness rule
  df = df[!duplicated(df$id),]
  return(df)
  }
}
mariyad/openbiodiving documentation built on June 3, 2019, 2:18 p.m.