背景:文本数据字段太多,只需要读取有用字段,以此降低内存压力方便分析
方法一:read.table、read.csv
指定列读取
直接使用自带函数设定适当的参数(colClasses
),该参数需要指定每列数据的类型(可以使用nrows
参数读取几行查看class
),不需要的列指定为NULL
即可。如下:
dat <- structure(list(Year = 2009:2011, Jan = c(-41L, -41L, -21L),
Feb = c(-27L, -27L, -27L), Mar = c(-25L, -25L, -2L),
Apr = c(-31L, -31L, -6L), May = c(-31L, -31L, -10L),
Jun = c(-39L, -39L, -32L), Jul = c(-25L, -25L, -13L),
Aug = c(-15L, -15L, -12L), Sep = c(-30L, -30L, -27L),
Oct = c(-27L, -27L, -30L), Nov = c(-21L, -21L, -38L),
Dec = c(-25L, -25L, -29L)),
.Names = c("Year", "Jan", "Feb", "Mar", "Apr", "May", "Jun",
"Jul", "Aug", "Sep", "Oct", "Nov", "Dec"),
class = "data.frame",
row.names = c(NA, -3L))
write.table(dat, "test.txt", row.names=FALSE)
## 查看每个列的class
df <- read.table("test.txt", nrow=2, header = TRUE)
apply(df, 2, class)
# Year Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
# "integer" "integer" "integer" "integer" "integer" "integer" "integer" "integer" "integer" "integer" "integer" "integer" "integer"
df <- read.table("test.txt",
colClasses = c(rep("integer", 7), rep("NULL", 6)),
header = TRUE)
# > df
# Year Jan Feb Mar Apr May Jun
# 1 2009 -41 -27 -25 -31 -31 -39
# 2 2010 -41 -27 -25 -31 -31 -39
# 3 2011 -21 -27 -2 -6 -10 -32
write.csv(dat, "test.csv", row.names=FALSE)
df <- read.csv("test.csv",
colClasses = c(rep("integer", 7), rep("NULL", 6)),
header = TRUE)
# > df
# Year Jan Feb Mar Apr May Jun
# 1 2009 -41 -27 -25 -31 -31 -39
# 2 2010 -41 -27 -25 -31 -31 -39
# 3 2011 -21 -27 -2 -6 -10 -32
方法二:使用package:colbycol
colbycol
没有安装成功,停止更新,好像也不怎么支持了
方法三:使用其他R package
辅助实现(如RJDBC
数据库操作)
实际就是用Java
来解决这个问题,需要安装配置多个环境,维护成本大、太复杂没有去实现
library(RJDBC)
write.table(x=mtcars, file="mtcars.csv", sep=",", row.names=F, col.names=T)
path.to.jdbc.driver <- "jdbc//csvjdbc-1.0-18.jar"
drv <- JDBC("org.relique.jdbc.csv.CsvDriver", path.to.jdbc.driver)
conn <- dbConnect(drv, sprintf("jdbc:relique:csv:%s", getwd()))
head(dbGetQuery(conn, "select * from mtcars"), 3)
# mpg cyl disp hp drat wt qsec vs am gear carb
# 1 21 6 160 110 3.9 2.62 16.46 0 1 4 4
# 2 21 6 160 110 3.9 2.875 17.02 0 1 4 4
# 3 22.8 4 108 93 3.85 2.32 18.61 1 1 4 1
head(dbGetQuery(conn, "select mpg, gear from mtcars"), 3)
方法四:借助linux
命令行工具实现
快速,方便;需要熟悉awk,cut
的语法
cut
功能比较单一适合处理分割整齐的数据,而awk
的功能更加强大(awk使用1,awk使用2)
dat <- structure(list(Year = 2009:2011, Jan = c(-41L, -41L, -21L),
Feb = c(-27L, -27L, -27L), Mar = c(-25L, -25L, -2L),
Apr = c(-31L, -31L, -6L), May = c(-31L, -31L, -10L),
Jun = c(-39L, -39L, -32L), Jul = c(-25L, -25L, -13L),
Aug = c(-15L, -15L, -12L), Sep = c(-30L, -30L, -27L),
Oct = c(-27L, -27L, -30L), Nov = c(-21L, -21L, -38L),
Dec = c(-25L, -25L, -29L)),
.Names = c("Year", "Jan", "Feb", "Mar", "Apr", "May", "Jun",
"Jul", "Aug", "Sep", "Oct", "Nov", "Dec"),
class = "data.frame",
row.names = c(NA, -3L))
# 使用制表符分割
write.table(dat, "test.txt", row.names=FALSE, sep = "\t")
df <- read.table(pipe("cut -f 1,5 test.txt"), header=TRUE)
df
system("cut -f 1,5 test.txt")
# Year Apr
# 1 2009 -31
# 2 2010 -31
# 3 2011 -6
# 使用空格分割
write.table(dat, "test.txt", row.names=FALSE, sep = " ")
df <- read.table(pipe("cut -d ' ' -f 1,5 test.txt"), header=TRUE)
df
system("cut -d ' ' -f 1,5 test.txt")
# Year Apr
# 1 2009 -31
# 2 2010 -31
# 3 2011 -6
总结
- 数据量不是很大且对数据内容了解时,可以使用read.table指定合理的参数colClasses读取。
- 数据较大且要求速度建议借助linux下的数据处理工具