R/preprocessPost.R

#' @title Preprocess
#' @description Extract useful data frome database, and organize them to a needed format
#' @param district City name, character
#' @param host Host of the server, character
#' @param port Port of the server, numeric
#' @param user User of the server, character
#' @param password Password of the server, character
#' @param dbname Database name of city, character
#' @param mon Current month in the form as param startmon, character
#' @param enctype The encoding type, gbk or utf8 or else ? character
#' @return A list: 
#' @return (1) preproccessed data, data frame
#' @return (2) the first month conform to some regularity, character
#' @return (3) the last month conform to some regularity, character
#' @return Note: here, the total number of sample/month should be greater than 10.
#' @export
preprocessPost<-function(district,host,port,user,password,dbname,mon,enctype,outpath){
  
  ##############################################
  #the used libraries and functions:           #
  #  library(maps)                             #
  #  library(mapdata)                          #
  #  library(sp)                               #
  #  library(ggplot2)                          #
  #  library(maptools)                         #
  #  library(readxl)                           #
  #  library(tcltk)                            #
  #  library(RMySQL)                           #
  #  source("groupping.R",encoding = 'UTF-8')  #
  ##############################################
  # the last month
  # the month of last year, the laste month
  if (as.numeric(substr(mon,5,6)) > 1)
  {
    lastmon <- as.character(as.numeric(mon)-1)
  }else{
    lastmon <- as.character(as.numeric(mon)-89)
  }
  
  ### read the sample data of the laster month
  prefile <- paste0(outpath,"/pre-data/",lastmon,'/',district,".txt")
  if (file.exists(prefile))
  {
    result <- read.table(prefile,header = TRUE,fileEncoding = 'utf-8')
    names(result)[8:(ncol(result)-1)] <- gsub("X","",names(result)[8:(ncol(result)-1)])
  }else{
    result <- data.frame("code"=c(NA),"name"=c(NA),"dist"=c(NA),"long"=c(NA),
                         "lat"=c(NA),"no1"=c(NA),"no2"=c(NA),"st"=c(NA))
    result <- result[-1,]
  }
  
  #############################################################################
  ################################# fetch data ################################
  #############################################################################
  # entrance the database
  con <- try(dbConnect(MySQL(),host=host,dbname=dbname,user=user,
                       password=password,port=port),silent = TRUE)
  if (class(con) == "try-error"){
    cat("The database cannot be connected !!!")
    return(0)
  }
  dbSendQuery(con,enctype)
  
  # fetch price data
  if (substr(mon,5,6) == "12") {
    mon_yr <- as.character(as.numeric(substr(mon,1,4))+1)
    mon_mon <- "01"
  }else{
    mon_yr <- substr(mon,1,4)
    mon_mon <- as.character(as.numeric(substr(mon,5,6))+1)
  }
  str = paste0("SELECT ha_code,quotedprice FROM ha_quote
                      WHERE ha_code in (SELECT ha_code FROM ha WHERE ha_cl_code='pa' AND sale_phase<3 AND confirmid=2)
                      and confirm_id in (1,2)
                      and quotedprice IS NOT NULL and ha_code IS NOT NULL
                      and quotedprice <> 'NA' and ha_code <> 'NA'
                      and quotedprice <> '' and ha_code <> ''
                      and priceexplain='均价'
                      and begintime >= '",substr(mon,1,4),"-",substr(mon,5,6),"-01 00:00:00","'
                      and begintime < '",mon_yr,"-",mon_mon,"-01 00:00:00","'")
  price <- dbGetQuery(con,str)
  names(price) <- c("code",mon)
  
  # fetch community data
  community <- dbGetQuery(con, "SELECT ha_code,ha_name,dist_name,avg(x) AS 'long',avg(y) AS 'lat',id,remark FROM ha_position 
                          WHERE ha_code in (SELECT ha_code FROM ha WHERE ha_cl_code='pa' AND sale_phase<3 AND confirmid=2)
                          and x IS NOT NULL and y IS NOT NULL and ha_code IS NOT NULL
                          and x <> 'NA' and y <> 'NA' and ha_code <> 'NA'
                          and x <> '' and y <> '' and ha_code <> ''
                          GROUP BY ha_code")
  names(community) <- c("code","name","dist","long","lat","no1","no2")

  # fetch seld out time
  stop <- dbGetQuery(con, "SELECT ha_code,begin_time FROM ha_sale_state 
                     WHERE ha_code in (SELECT ha_code FROM ha WHERE ha_cl_code='pa' AND sale_phase<3 AND confirmid=2)
                     and sale_code=3 and ha_code IS NOT NULL
                     and ha_code <> 'NA' and ha_code <> ''")
  names(stop) <- c("code","st")
  stop$st<-paste0(substr(stop$st,1,4),substr(stop$st,6,7))
  dbDisconnect(con)
  
  #############################################################################
  #################### combine to "result" dataframe ##########################
  #############################################################################
  # combine community and price
  communityPrice <- merge(community,price,by="code")
  # combine result0 and community-price
  result <- merge(result,communityPrice,by=intersect(names(result),names(communityPrice)),all=T)
  result <- cbind(result[1:(ncol(result)-2)],result[ncol(result)],result[ncol(result)-1])
  
  # 删除重复点
  result<-result[!(duplicated(result[4:5])),]
  if (nrow(result) == 0) {
    cat("There's no record about house price !!!")
    return(0)
  }
  
  # 整理售罄时间
  for (j in 1:nrow(result))
  {
    result[j,ncol(result)]<-subset(stop,stop$code==result[j,1])[1,2]
  }
  
  # 删除给定月份无报价数据的所有楼盘数据
  ntemp<-ncol(result)
  for (j in 1:nrow(result))
  {
    result$ok[j]<-!all(is.na(result[j,8:(ntemp-1)]))
  }
  result<-subset(result,ok)
  result<-result[-ncol(result)]
  if (nrow(result) == 0){
    cat("There's something wrong with the database, ignore this problem !!!")
    return(0)
  }
  row.names(result)<-c(1:nrow(result))
  
  # 本月度在result中的列数
  moncol <- ncol(result)-1
  
  # 填充该月度价格数据(开始报价至今)
  if (moncol > 8){
    for (j in 1:nrow(result))
    {
      if (is.na(result[j,moncol])) {result[j,moncol]=result[j,moncol-1]}
    }
  }
  
  # 剔除售罄后数据
  if (moncol > 8){
    for (j in 1:nrow(result))
    {
      if (!is.na(result[j,ncol(result)]) & 
          as.numeric(names(result[moncol])) > as.numeric(result[j,ncol(result)]))
      {
        result[j,moncol]=NA
      }
    }
  }
  
  # # 剔除样本点小于10的月度数据
  # if(sum(!is.na(result[moncol])) < 10){result <- result[-moncol]}
  
  if (ncol(result) < 9){
    cat("Records less than 10 !!!")
    return(0)
  }
  
  # update pre-data
  write.table(result,paste0(outpath,"/pre-data/",mon,'/',district,".txt"),
              row.names = FALSE,sep='\t', fileEncoding = 'utf-8')
  
  return(result)
}
Menglinucas/HousePC_serial-parallel documentation built on May 13, 2019, 10:59 p.m.