-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathDataPreprocessing.R
71 lines (40 loc) · 1.59 KB
/
DataPreprocessing.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
# Problem statement: estimate the selling potential of a property from its
# parameters, i.e. determine whether or not the property will be sold within 3 months.
# Load the housing data; the first row of the CSV holds the column names.
df <- read.csv("HousePrice.csv", header = TRUE)
# View() opens a data viewer, which only makes sense in an interactive
# session, so it is skipped when the script runs in batch mode.
if (interactive()) {
  View(df)
}
str(df) # prints the structure of df

# EDD (extended data dictionary) ----
summary(df)

# Graphs ----
boxplot(df$n_hot_rooms) # look for outliers in n_hot_rooms
pairs(~ df$Sold + df$rainfall) # Sold against rainfall; look for outliers
barplot(table(df$airport)) # counts of each distinct airport value
barplot(table(df$bus_ter)) # counts of each distinct bus_ter value
############# Conclusions from graphs #############
# rainfall and n_hot_rooms have outliers
# the bus terminal variable (bus_ter) is not useful, as it does not make a difference
# n_hos_beds has missing values
###################################################
# Outlier treatment ----
# Upper cap for n_hot_rooms: three times its 99th percentile. Any value
# above the cap is replaced by the cap itself.
uv <- 3 * quantile(df$n_hot_rooms, probs = 0.99)
df$n_hot_rooms <- replace(df$n_hot_rooms, df$n_hot_rooms > uv, uv)
# Lower floor for rainfall: 0.3 times its 1st percentile. Any value below
# the floor is raised to the floor.
lv <- 0.3 * quantile(df$rainfall, probs = 0.01)
df$rainfall <- replace(df$rainfall, df$rainfall < lv, lv)
# Missing-value treatment ----
# Show which rows have no n_hos_beds value.
which(is.na(df$n_hos_beds))
# Impute the missing entries with the mean of the observed values.
beds_mean <- mean(df$n_hos_beds, na.rm = TRUE)
df$n_hos_beds <- replace(df$n_hos_beds, is.na(df$n_hos_beds), beds_mean)
# Show the column mean after imputation.
mean(df$n_hos_beds)
# Variable transformation ----
# Collapse the four distance columns into their arithmetic mean.
df$avg_dist <- (df$dist1 + df$dist2 + df$dist3 + df$dist4) / 4
# Drop the now-redundant distance columns and the not-useful bus_ter column
# by name, so the result does not depend on the column order of the CSV.
# NOTE(review): the original dropped positions 6:9 and then 13; confirm that
# those positions hold dist1..dist4 and bus_ter in HousePrice.csv.
df <- df[
  ,
  !(names(df) %in% c("dist1", "dist2", "dist3", "dist4", "bus_ter")),
  drop = FALSE
]
# Categorical variable handling ----
# Dummy variables
# Install the dummies package only when it is missing, instead of
# re-installing it (and needing network access) on every run.
if (!requireNamespace("dummies", quietly = TRUE)) {
  install.packages("dummies")
}
library(dummies)
# NOTE(review): presumably expands each categorical column into one indicator
# column per level -- confirm against the dummies package documentation.
df <- dummy.data.frame(df)
# Only one indicator is needed for a yes/no variable, so the redundant dummy
# columns are dropped by position.
# Column 8 is removed first; that shifts every later column left by one, so
# the index 13 below refers to what was column 14 before the first drop.
# NOTE(review): positions depend on the column layout produced above -- verify.
df <- df[, -8]
df <- df[, -13]