#' Remove duplicate participants by matching demographic fields
#'
#' Flags possible duplicate records (rows sharing study id plus a
#' combination of birthdate, height, weight, and ethnicity, and optionally
#' email), keeps the most complete record in each duplicate group, and
#' removes the rest. NOTE: columns 2:9 of `x` are assumed to hold the
#' demographic fields when deciding whether a flagged row carries enough
#' information to judge (all-NA rows are ignored) -- confirm this matches
#' your instrument layout.
#'
#' @param x A data.frame of participant responses.
#' @param redcapid Column name of the autogenerated id (REDCap, Ethica, etc.).
#' @param id Column name of the study id (e.g., participant initials, IE001).
#' @param birthdate Column name with participant birthdate.
#' @param gender Column name with participant gender (currently not used in
#'   matching; kept for interface compatibility).
#' @param height Column name with participant height.
#' @param weight Column name with participant weight.
#' @param ethnicity Column name with participant ethnicity.
#' @param email Optional. Column name with participant email; when supplied,
#'   an additional pass removes records sharing the same non-missing email.
#' @return A message reporting how many duplicate rows were removed. As a
#'   side effect, the de-duplicated data.frame is assigned back to the
#'   caller's object name in the global environment.
#' @examples
#' dupsbydemos(fake_data, redcapid = "id", id = "si", birthdate = "birthdate",
#'   gender = "demo_2", height = "height", weight = "weight",
#'   ethnicity = "ethnicity", email = "demo_11")
#' @export
dupsbydemos <- function(x, redcapid, id, birthdate, gender, height, weight,
                        ethnicity, email) {
  object_name <- deparse(substitute(x))
  start_n <- nrow(x)

  # Given flagged duplicate rows grouped by `group_col`, return the
  # `redcapid` values of the rows to drop. Within each group we keep the
  # most complete record (lowest share of NAs); when all records are
  # equally complete we keep the one with the largest `redcapid`
  # (assumed most recent -- TODO confirm with study conventions).
  # The original code extracted column 1 by position (assuming it was
  # redcapid) and used opposite tie-break directions in different passes;
  # this version always extracts `redcapid` by name and breaks ties the
  # same way in every pass.
  ids_to_drop <- function(dups, group_col) {
    groups <- split(dups, dups[[group_col]])
    drops <- lapply(groups, function(g) {
      pct_na <- rowMeans(is.na(g))
      if (all(pct_na == pct_na[1])) {
        keep <- g[[redcapid]] == max(g[[redcapid]])
      } else {
        keep <- pct_na == min(pct_na)
      }
      g[[redcapid]][!keep]
    })
    unlist(unname(drops))
  }

  # One de-duplication pass: flag rows that agree on every column in
  # `cols`, discard flagged rows that have nothing but an id to go on
  # (all 8 of columns 2:9 are NA), then drop the redundant records.
  dedup_pass <- function(dat, cols) {
    flagged <- dat[duplicated(dat[cols]) |
                     duplicated(dat[cols], fromLast = TRUE), ]
    flagged <- flagged[rowSums(is.na(flagged[, 2:9])) != 8, ]
    dat[!(dat[[redcapid]] %in% ids_to_drop(flagged, id)), ]
  }

  # Three passes on progressively different field combinations; the last
  # pass omits birthdate so typos such as 2020 vs 2021 still match.
  # (The original duplicated these passes verbatim in both branches of
  # `if (missing(email))`; behavior is the same with one copy.)
  x <- dedup_pass(x, c(birthdate, id, weight))
  x <- dedup_pass(x, c(birthdate, id, height, ethnicity))
  x <- dedup_pass(x, c(id, height, weight, ethnicity))

  # Optional pass on exact email matches, grouping by email rather than
  # study id. Fixes two bugs in the original: it referenced a stray
  # `test` object instead of `x`, and filtered NAs with
  # `-which(is.na(...))`, which drops EVERY row when there are no NAs.
  if (!missing(email)) {
    flagged <- x[duplicated(x[email]) |
                   duplicated(x[email], fromLast = TRUE), ]
    flagged <- flagged[!is.na(flagged[[email]]), ]
    x <- x[!(x[[redcapid]] %in% ids_to_drop(flagged, email)), ]
  }

  end_n <- nrow(x)
  number_removed <- start_n - end_n

  # NOTE(review): writing the cleaned data back into the global
  # environment is a side effect existing callers rely on; kept for
  # backward compatibility even though returning the data.frame would be
  # more idiomatic.
  assign(object_name, value = x, envir = globalenv())

  paste0(number_removed,
         " observations were identified and removed as duplicates based on ",
         "ids with matching weights, heights, emails, and ethnicity. ",
         "Examine other possible duplicates this function did not catch below")
}
# (Removed: stray website-scrape boilerplate "Add the following code to your
# website..." -- not R code; it would cause a parse error in this file.)