#' Remove duplicate participants by matching demographic fields
#'
#' Flags possible duplicate records (rows sharing study id plus a
#' combination of birthdate, height, weight, and ethnicity, and optionally
#' email), keeps the most complete record in each duplicate group, and
#' removes the rest. NOTE: columns 2:9 of `x` are assumed to hold the
#' demographic fields when deciding whether a flagged row carries enough
#' information to judge (all-NA rows are ignored) -- confirm this matches
#' your instrument layout.
#'
#' @param x A data.frame of participant responses.
#' @param redcapid Column name of the autogenerated id (REDCap, Ethica, etc.).
#' @param id Column name of the study id (e.g., participant initials, IE001).
#' @param birthdate Column name with participant birthdate.
#' @param gender Column name with participant gender (currently not used in
#'   matching; kept for interface compatibility).
#' @param height Column name with participant height.
#' @param weight Column name with participant weight.
#' @param ethnicity Column name with participant ethnicity.
#' @param email Optional. Column name with participant email; when supplied,
#'   an additional pass removes records sharing the same non-missing email.
#' @return A message reporting how many duplicate rows were removed. As a
#'   side effect, the de-duplicated data.frame is assigned back to the
#'   caller's object name in the global environment.
#' @examples
#' dupsbydemos(fake_data, redcapid = "id", id = "si", birthdate = "birthdate",
#'   gender = "demo_2", height = "height", weight = "weight",
#'   ethnicity = "ethnicity", email = "demo_11")
#' @export
dupsbydemos <- function(x, redcapid, id, birthdate, gender, height, weight,
                        ethnicity, email) {
  object_name <- deparse(substitute(x))
  start_n <- nrow(x)

  # Given flagged duplicate rows grouped by `group_col`, return the
  # `redcapid` values of the rows to drop. Within each group we keep the
  # most complete record (lowest share of NAs); when all records are
  # equally complete we keep the one with the largest `redcapid`
  # (assumed most recent -- TODO confirm with study conventions).
  # The original code extracted column 1 by position (assuming it was
  # redcapid) and used opposite tie-break directions in different passes;
  # this version always extracts `redcapid` by name and breaks ties the
  # same way in every pass.
  ids_to_drop <- function(dups, group_col) {
    groups <- split(dups, dups[[group_col]])
    drops <- lapply(groups, function(g) {
      pct_na <- rowMeans(is.na(g))
      if (all(pct_na == pct_na[1])) {
        keep <- g[[redcapid]] == max(g[[redcapid]])
      } else {
        keep <- pct_na == min(pct_na)
      }
      g[[redcapid]][!keep]
    })
    unlist(unname(drops))
  }

  # One de-duplication pass: flag rows that agree on every column in
  # `cols`, discard flagged rows that have nothing but an id to go on
  # (all 8 of columns 2:9 are NA), then drop the redundant records.
  dedup_pass <- function(dat, cols) {
    flagged <- dat[duplicated(dat[cols]) |
                     duplicated(dat[cols], fromLast = TRUE), ]
    flagged <- flagged[rowSums(is.na(flagged[, 2:9])) != 8, ]
    dat[!(dat[[redcapid]] %in% ids_to_drop(flagged, id)), ]
  }

  # Three passes on progressively different field combinations; the last
  # pass omits birthdate so typos such as 2020 vs 2021 still match.
  # (The original duplicated these passes verbatim in both branches of
  # `if (missing(email))`; behavior is the same with one copy.)
  x <- dedup_pass(x, c(birthdate, id, weight))
  x <- dedup_pass(x, c(birthdate, id, height, ethnicity))
  x <- dedup_pass(x, c(id, height, weight, ethnicity))

  # Optional pass on exact email matches, grouping by email rather than
  # study id. Fixes two bugs in the original: it referenced a stray
  # `test` object instead of `x`, and filtered NAs with
  # `-which(is.na(...))`, which drops EVERY row when there are no NAs.
  if (!missing(email)) {
    flagged <- x[duplicated(x[email]) |
                   duplicated(x[email], fromLast = TRUE), ]
    flagged <- flagged[!is.na(flagged[[email]]), ]
    x <- x[!(x[[redcapid]] %in% ids_to_drop(flagged, email)), ]
  }

  end_n <- nrow(x)
  number_removed <- start_n - end_n

  # NOTE(review): writing the cleaned data back into the global
  # environment is a side effect existing callers rely on; kept for
  # backward compatibility even though returning the data.frame would be
  # more idiomatic.
  assign(object_name, value = x, envir = globalenv())

  paste0(number_removed,
         " observations were identified and removed as duplicates based on ",
         "ids with matching weights, heights, emails, and ethnicity. ",
         "Examine other possible duplicates this function did not catch below")
}
# (Removed: stray website-scrape boilerplate "Add the following code to your
# website..." -- not R code; it would cause a parse error in this file.)