bottles.db = function( DS, p, plotdata=FALSE ) {
# Gordana Lazin, 8 April 2015, first version
# data view of nutrients and chl from biochem db
bottles.dir = project.datadirectory("aegis", "biochem", "bottles")
bottles.data.dir = file.path( bottles.dir, "data" )
bottles.datadump.dir = file.path( bottles.dir, "data", "datadump" )
if ( DS == "bottles.dump.rawdata" ) {
if (FALSE) {
"
# Shelley's BioChem query for nutrients and chl
create or replace view climatology as (
SELECT m.data_center_code, m.name, m.descriptor mission, m.start_date, m.end_date,
e.collector_event_ID,e.collector_station_name,
h.start_date header_start,
e.start_date event_start,
h.start_time header_start_time,
e.start_time event_start_time,
h.end_date header_end,
e.end_date event_end,
h.end_time header_end_time,
e.end_time event_end_time,
h.time_qc_code,
h.start_lat header_start_lat,
h.start_lon header_start_lon,
h.end_lat header_end_lat,
h.end_lon header_end_lon,
h.position_qc_code,
h.start_depth header_start_depth,
h.end_depth header_end_depth,
h.sounding header_sounding,
d.collector_sample_id sample_id,
d.data_type_seq,
dt.data_retrieval_seq,
dt.method,
dt.description data_type,
dt.priority,
d.data_value data_value,
u.name unit_for_data,
d.data_qc_code,
d.averaged_data
FROM biochem.bcdiscretedtails d, biochem.bcdiscretehedrs h,biochem.bcevents e,
biochem.bcmissions m, biochem.bcdatatypes dt, bcunits u,
bcdataretrievals dr
WHERE d.discrete_seq=h.discrete_seq
AND h.event_seq=e.event_seq
and d.data_type_seq=dt.data_type_seq
AND e.mission_seq=m.mission_seq
AND dt.unit_seq=u.unit_seq
and (upper(dr.parameter_description)= 'CHLOROPHYLL A'
or upper(dr.parameter_description) like '%SILICATE%'
or upper(dr.parameter_description) like '%NITRATE%'
OR (upper(dr.parameter_description) like '%PHOSPH%' and method like 'PO4%'))
AND dt.data_retrieval_seq = dr.data_retrieval_seq
and (h.start_lat BETWEEN 37 AND 48 AND h.start_lon BETWEEN -71 AND -48));
# QA/QC done locally
#and upper(dr.parameter_description) not like '%UPTAKE%'
#and to_number(to_char (h.start_date, 'YYYY')) BETWEEN to_number(to_char (m.end_date,'YYYY'))
#and to_number(to_char (m.start_date,'YYYY'))
#and to_char(e.start_date, 'DD-MON-YYYY') = to_char(h.start_date, 'DD-MON-YYYY');
#and h.position_qc_code in (0,1,5)
#and d.data_qc_code in (0,1,5)
#and d.data_value >0
Shelley: The script which pulls your data is
select * from climatology
where
to_number(to_char (header_start, 'YYYY')) BETWEEN to_number(to_char (end_date,'YYYY'))
and to_number(to_char (start_date,'YYYY'))
and to_char(event_start, 'DD-MON-YYYY') = to_char(header_start, 'DD-MON-YYYY');
"
}
dir.create( bottles.datadump.dir, recursive=TRUE, showWarnings=FALSE )
con = ROracle::dbConnect( DBI::dbDriver("Oracle"), dbname=oracle.biochem.server, username=oracle.biochem.user, password=oracle.biochem.password, believeNRows=F)
for ( YR in p$bottles.years ) {
query = paste(
"select * from biochem.climatology where to_number(to_char (header_start, 'YYYY'))
BETWEEN to_number(to_char (end_date,'YYYY'))
and to_number(to_char (start_date,'YYYY'))
and to_char(event_start, 'DD-MON-YYYY') = to_char(header_start, 'DD-MON-YYYY')
and to_number(to_char (start_date,'YYYY')) =", YR )
fn = file.path( bottles.datadump.dir, paste( "bottles.dump.rawdata", YR, ".rdata", sep="") )
res = NULL
res = ROracle::dbGetQuery(con, query )
save(res, file=fn, compress=TRUE )
print(YR)
}
ROracle::dbDisconnect(con)
}
# --------------------------------
if ( DS == "bottles.rawdata.all.redo" ) {
fn.all = file.path( bottles.datadump.dir, paste( "bottles.dump.rawdata", "all_years", "rdata", sep=".") )
out = NULL
for ( YR in p$bottles.years ) {
fn = file.path( bottles.datadump.dir, paste( "bottles.dump.rawdata", YR, ".rdata", sep="") )
res = NULL
if (file.exists(fn)) load( fn )
out = rbind( out, res )
}
save(out, file=fn.all, compress=TRUE)
}
# --------------------------------
if ( DS == "bottles.rawdata.all" ) {
fn.all = file.path( bottles.datadump.dir, paste( "bottles.dump.rawdata", "all_years", "rdata", sep=".") )
out= NULL
if (file.exists(fn.all)) load(fn.all)
return( out)
}
# --------------------------------
if ( DS %in% c("bottles.qa.qc", "bottles.qa.qc.redo") ) {
fn = file.path( bottles.data.dir, paste( "bottles.qa.qc", ".rdata", sep="") )
if ( DS == "bottles.qa.qc" ) {
out = NULL
if (file.exists(fn)) load(fn)
return(out)
}
nc = bottles.db( DS="bottles.rawdata.all", p=p )
# data filtering is in plot_chl_nut.r
names(nc)=tolower(names(nc))
# replace factor with characters
i = sapply(nc, is.factor)
nc[i] <- lapply(nc[i], as.character)
# === Data filtering starts here ====
# find flagged data (2 inconsistent, 3 doubtful, 4 erroneous)
flagged=which(nc$position_qc_code %in% c(2,3,4) | nc$data_qc_code %in% c(2,3,4))
# find data less than 0
neg=which(nc$data_value<0)
# find methods that have "Uptake" in method field
uptake=grep("Uptake",nc$method)
# make a list of missions with bad records for each parameter
bm=list(bmc=c("OC7908","32G879008"),bmp=c("18HU88026"),bms=c("18HU167005","31TR26870","180167005"))
# find indices for bad missions for each parameter (no bad missions for nitrate)
bc=intersect(grep("Chl", nc$method), which(nc$mission %in% bm$bmc) )
bp=intersect(grep("PO4", nc$method), which(nc$mission %in% bm$bmp) )
bs=intersect(grep("SiO4", nc$method), which(nc$mission %in% bm$bms) )
# find duplicated records. Create uid and uid1 to spot duplicates
nc$uid=paste(nc$mission,"_",nc$collector_event_id,"_",nc$sample_id,sep="")
nc$uid1=paste(nc$uid,"_", nc$method, sep="")
dup=which(duplicated(nc$uid1)) # duplicated records
# start and end depth have to be the same. if not remove those records
ddif=nc$header_start_depth-nc$header_end_depth
different_depths=which(ddif !=0)
# remove suspect cast from mission CHA7910
ll=which(out$mission=="CHA7910" & out$param=="chl" & out$collector_event_id==50)
# remove identified records and create filtered nutrient-chlorophyll dataset ncf
ncf=nc[-unique(c( flagged,neg,uptake,bc,bp,bs,dup,different_depths,ll)) ,]
#rm(nc)
# ==== flag coastal and ocean data ====
# load polygons 5km away from coastline (dataframe is c5kf)
fnc5kf=file.path( bottles.data.dir, "coast5km.polygons.4filtering.rdata")
load(fnc5kf) # loads c5kf (c)
# add coastFlag field (1 for open ocean, 2 for coastal ocean)
# set coastFlag to 1 for all records
ncf$coastFlag=1
no_poly=length(unique(c5kf$flag))
library(sp)
# find points in polygons. These are within 5km off coast; flag those as 2
for (i in 1:no_poly) {
pol=which(c5kf$flag==i)
a = which( point.in.polygon( ncf$header_start_lon, ncf$header_start_lat,
c5kf$lon[pol], c5kf$lat[pol]) != 0 )
ncf$coastFlag[a]=2
}
# === ncf$coastalFlag is 1 for ocean data, and 2 for coastal data ===
# add param column with values chl,nit, pho, sil
# so it will be easier to separate the parameters
ncf$param=NA
ncf$param[grep("Chl",ncf$method)]="chl"
ncf$param[grep("NO3",ncf$method)]="nit"
ncf$param[grep("PO4",ncf$method)]="pho"
ncf$param[grep("SiO4",ncf$method)]="sil"
# Data filtering based on IML bottle data quality control described on:
# http://slgo.ca/app-sgdo/en/docs_reference/botl_odf_quality.html
# Defines acceptable range of values for NW Atlantic for nutrients and chl, valid for open ocean only
# filter ocean data only according to IML range limits for NW Atlantic (IML Test 2.1)
chlF=which(ncf$param=="chl" & (ncf$data_value<0 | ncf$data_value>50) & ncf$coastFlag==1)
nitF=which(ncf$param=="nit" & (ncf$data_value<0 | ncf$data_value>515) & ncf$coastFlag==1)
phoF=which(ncf$param=="pho" & (ncf$data_value<0 | ncf$data_value>4.5) & ncf$coastFlag==1)
silF=which(ncf$param=="sil" & (ncf$data_value<0 | ncf$data_value>250) & ncf$coastFlag==1)
# IML Test 2.4 -- deep values for silicate and phosphate cannot be below 0.01
dsil=which(ncf$param=="sil" & (ncf$data_value<0.01 | ncf$data_value>250)
& ncf$coastFlag==1
& (ncf$header_start_depth>=150 & ncf$header_start_depth<=900))
dpho=which(ncf$param=="pho" & (ncf$data_value<0.01 | ncf$data_value>4.5)
& ncf$coastFlag==1
& (ncf$header_start_depth>=150 & ncf$header_start_depth<=1500))
# indices of all the records that do not pass IML Test 2.1 and 2.4 (out of range)
out_of_range=unique(c(chlF,nitF,phoF,silF,dsil,dpho))
# remove out of range records
ncf=ncf[-out_of_range,]
# add month and year
require(lubridate)
ncf$month=month(ncf$header_start)
ncf$year=year(ncf$header_start)
# ===== look for overlaps between the methods =====
# in this dataset there are overlapps only for chlorophyll
# define which parameter will be investigated
df=ncf[which(ncf$param=="chl"),]
# c contains unique methods
c=unique(df$method)
l=length(c)
# check for overlaps two methods at the time
# have to consider all combination of the methods
# install.packages("combinat")
require(combinat)
# p contains all combinations of two methods
# i.e. method1 vs method2, method1 vs method3, method2 vs method3 etc.
p=combn(l,2)
no_combinations=dim(p)[2]
# initiate lists containing number of overlaps and overlapping uid
no_overlaps=NULL
overlapping_uid=list()
# find overlaping sample id's for each method and store them in the list
# if there are any overlaps plott scatter plot, otherwise don't do anything
for (i in 1:no_combinations) {
# define which methods are compared
method1=c[p[1,i]]
method2=c[p[2,i]]
# overlapping samples
o=intersect(df$uid[which(df$method==method1)],df$uid[which(df$method==method2)])
# lists that contains overlapping uid and number of overlaps
overlapping_uid[[i]]=o
no_overlaps[i]=length(o)
# if there are overlaps plot the scatter plot
if (plotdata) {
if (length(o)>0) {
# indices of overlapping samples
im1=which(df$method==method1 & df$uid %in% o)
im2=which(df$method==method2 & df$uid %in% o)
# uid are not in the same order. Need to merge based on uid
dfm=merge(df[im1,],df[im2,],by="uid")
# limit for the axis so x and y axis have the same range
axlim=c(0,ceiling(max(c(dfm$data_value.x,dfm$data_value.y)))+1)
# plot scatter plot for two methods
plot(dfm$data_value.x,dfm$data_value.y,xlab=method1, ylab=method2,
xlim=axlim, ylim=axlim, col="dodgerblue",
main=paste("Number of overlapping samples:",length(o)), cex.main=0.9)
abline(0,1)
ll=lm(dfm$data_value.y ~ dfm$data_value.x)
abline(ll, col="red",lwd=2)
# get r2, intercept and slope from the linear regression
r2=formatC(summary(ll)$r.squared, format="f", digits=3)
intc=formatC(coef(ll)["(Intercept)"], format="f", digits=3)
sl=format(coef(ll)["dfm$data_value.x"], format="f", digits=3)
eq=paste("y=",sl,"x +", intc,sep="")
r=paste("R2=", r2, sep="")
legend("topleft",c(eq,r),bty="n")
#plot overlapping samples on the map
require(maps)
require(mapdata)
coastline = maps::map( "worldHires", regions=c("Canada", "USA"), xlim=c(-71, -48 ), ylim=c(37, 50), fill=FALSE, plot=FALSE )
plot( coastline$x, coastline$y, pch="." , xlab="Longitude", ylab="Latitude",
main=paste(length(o), " overlapping samples\n", method1, "-", method2), cex.main=0.9)
points(dfm$header_start_lon.x,dfm$header_start_lat.x,cex=0.7, col="dodgerblue")
}
}
}
# ===== end for checking for overlaps ======
# have to decide what to do with overlapping points,
# which method to choose, or to correct
# it was decided to remove Welschmayer chl for overlapping samples
# First, make a character vector from all overlapping uid
overlap_uid=unlist(overlapping_uid)
# find indices for Welschmayer in overlapping samples
mw=which(ncf$method=="Chl_a_Welschmeyer_sF" & ncf$uid %in% overlap_uid)
# delete overlapping Welschmayer Chl from the dataframe
ncf=ncf[-mw,]
# ===== Data filtering complete ======
out=ncf
save(out, file=fn, compress=TRUE )
return( "Completed filtering step" )
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.