This is a quick trial of adding overall and conditional (by user) average columns to a data frame. The base, plyr, dplyr, data.table and dplyr + data.table approaches are used. Personally I prefer dplyr + data.table: dplyr for its comprehensive syntax and data.table for its speed.
## set up variables
size <- 36000      # number of rows to simulate
numUsers <- 4900   # number of distinct user ids
# one session id per seven user ids; %/% is integer division, so no manual
# "subtract the fractional part" step is needed (4900 %/% 7 is 700)
numSessions <- numUsers %/% 7

## create data frame
set.seed(123457)  # fixed seed so the simulated data is reproducible
# the three draws stay in this order so the same random stream is consumed
userIds <- sample.int(numUsers, size = size, replace = TRUE)
ssIds <- sample.int(numSessions, size = size, replace = TRUE)
scores <- sample.int(10, size = size, replace = TRUE)

# User is stored as a factor; Session and Score stay integer
preDf <- data.frame(
  User = as.factor(userIds),
  Session = ssIds,
  Score = scores
)
Adding overall average
Since calculating an overall average is not complicated, I don’t find a big difference among the packages.
# base
# transform() evaluates the expression against the columns of preDf; the
# single overall mean fills the new MeanScore column on every row
system.time({
  overallDf1 <- transform(preDf, MeanScore = mean(Score, na.rm = TRUE))
})
## user system elapsed
## 0.001 0.000 0.002
# plyr
# library() stops with an error when plyr is missing, whereas require() only
# returns FALSE and lets the script continue. plyr is attached here, before
# dplyr, so that later unqualified verbs resolve to dplyr's versions.
library(plyr)
# mutate is namespace-qualified so this timing still measures plyr when the
# snippet is re-run after dplyr has been attached and masks mutate()
system.time({
  overallDf2 <- plyr::mutate(preDf, MeanScore = mean(Score, na.rm = TRUE))
})
## user system elapsed
## 0.002 0.000 0.002
# dplyr
# library() fails loudly if dplyr is not installed (require() would only
# return FALSE)
library(dplyr)
# mutate is qualified because plyr exports a function of the same name
system.time({
  overallDf3 <- preDf %>%
    dplyr::mutate(MeanScore = mean(Score, na.rm = TRUE))
})
## user system elapsed
## 0.007 0.000 0.007
# data.table
# library() errors when data.table is missing instead of returning FALSE
library(data.table)
preDt <- data.table(preDf)
setkey(preDt, User)  # key (and sort) the table by User
# j rebuilds the three original columns and adds the overall mean; the
# length-one mean is recycled to every row. TRUE is spelled out because T is
# an ordinary variable that can be reassigned.
system.time({
  overallDt <- preDt[, list(
    User = User,
    Session = Session,
    Score = Score,
    MeanScore = mean(Score, na.rm = TRUE)
  )]
})
## user system elapsed
## 0.007 0.000 0.007
# dplyr + data.table
# the same dplyr verb, applied directly to the data.table preDt
system.time({
  overallDf4 <- mutate(preDt, MeanScore = mean(Score, na.rm = TRUE))
})
## user system elapsed
## 0.003 0.000 0.003
Adding average by user
It takes quite a long time using plyr, and the other packages are more practical - base is not even considered.
# plyr
# Every plyr function is namespace-qualified. When the snippets are run in
# order, dplyr is attached after plyr, so an unqualified mutate resolves to
# dplyr::mutate and ddply() would be handed dplyr's verb rather than plyr's.
# plyr is only checked for, not attached: attaching it at this point would
# mask dplyr's verbs for the snippets that follow.
stopifnot(requireNamespace("plyr", quietly = TRUE))
system.time({
  postDf1 <- plyr::ddply(
    preDf,
    "User",        # split preDf by user (same grouping as .(User))
    plyr::mutate,  # applied to each user's rows in turn
    MeanScore = mean(Score, na.rm = TRUE)
  )
})
## user system elapsed
## 76.488 0.522 76.990
# dplyr
# library() fails loudly if dplyr is not installed
library(dplyr)
# Verbs that plyr also exports (mutate, arrange) are qualified so the dplyr
# versions are used regardless of attach order.
system.time({
  postDf2 <- preDf %>%
    dplyr::group_by(User) %>%
    dplyr::mutate(MeanScore = mean(Score, na.rm = TRUE)) %>%
    # drop the grouping so later operations on postDf2 are not silently
    # carried out per user
    dplyr::ungroup() %>%
    dplyr::arrange(User)
})
## user system elapsed
## 0.022 0.006 0.028
# data.table
# library() errors when data.table is missing instead of returning FALSE
library(data.table)
preDt <- data.table(preDf)
setkey(preDt, User)  # key (and sort) the table by User
# by = User evaluates j once per user, so mean(Score) is that user's mean and
# is recycled across the user's rows. TRUE is spelled out because T is an
# ordinary variable that can be reassigned.
system.time({
  postDt <- preDt[, list(
    Session = Session,
    Score = Score,
    MeanScore = mean(Score, na.rm = TRUE)
  ), by = User]
})
## user system elapsed
## 0.005 0.004 0.009
# dplyr + data.table
# dplyr verbs applied to the data.table preDt. Verbs that plyr also exports
# (mutate, arrange) are qualified so the dplyr versions are used regardless
# of attach order.
system.time({
  postDf3 <- preDt %>%
    dplyr::group_by(User) %>%
    dplyr::mutate(MeanScore = mean(Score, na.rm = TRUE)) %>%
    # drop the grouping so later operations on postDf3 are not silently
    # carried out per user
    dplyr::ungroup() %>%
    dplyr::arrange(User)
})
## user system elapsed
## 0.008 0.004 0.012
Comments