- Had you with to convert PDF file tables to excel or dataframe in R.
- here is one of the way to do it in R. Example file is here
- Intall required libraries
- “tabulizer is available in github”
- You can install it as follows
- First, install.packages(“devtools”), then run following codes if require.
- library(devtools) # needs to install from github
- install_github(c(“ropenscilabs/tabulizerjars”, “ropenscilabs/tabulizer”)) # install “tabulizer”
library(tabulizer)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
- Give the PDF file location
path.file <- "...../temp data.pdf"
out.tb <- extract_tables(path.file)
out.tb
## [[1]]
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]
## [1,] "month" "1977" "1978" "1979" "1980" "1981" "1982" "1983" "1984"
## [2,] "Jan" "10.3" "10.3" "13.9" "8.8" "8.1" "7.9" "7.1" "5.5"
## [3,] "Feb" "11.8" "8" "12" "10.6" "13.2" "7.4" "10.5" "11.3"
## [4,] "Mar" "15.4" "10.8" "13.4" "13.4" "13.8" "10" "13.1" "16.9"
## [5,] "Apr" "18.7" "16.9" "17.5" "19.1" "16.5" "17.5" "16.6" "18.3"
## [6,] "May" "20.3" "20.2" "19.6" "19.6" "18.4" "20.7" "18.3" "19.7"
## [7,] "Jun" "22" "20.3" "20.3" "20.2" "20.4" "21.9" "21.3" "19.8"
## [8,] "Jul" "23.5" "20" "19.3" "19.1" "19.9" "21" "21.2" "19.9"
## [9,] "Aug" "24.5" "20" "19.3" "19" "18.4" "19.3" "20.2" "19.9"
## [10,] "Sep" "20.1" "17.9" "18.3" "18.5" "17.9" "20.9" "19.3" "19.9"
## [11,] "Oct" "17.2" "16.4" "16.1" "15.3" "15.7" "18.1" "15.3" "15.8"
## [12,] "Nov" "15.2" "11.4" "14.5" "10.6" "13.3" "13.3" "13.9" "13.9"
## [13,] "Dec" "16.5" "12.9" "10.6" "5.6" "10.2" "9.2" "11.4" "12"
## [,10]
## [1,] "1985"
## [2,] "10.3"
## [3,] "11.8"
## [4,] "15.4"
## [5,] "19.4"
## [6,] "19.2"
## [7,] "21.9"
## [8,] "18.8"
## [9,] "21"
## [10,] "18.8"
## [11,] "15.8"
## [12,] "13.6"
## [13,] "12.8"
- Convert to data frame for futher process
tb.df <- as.data.frame(out.tb)
head(tb.df)
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
## 1 month 1977 1978 1979 1980 1981 1982 1983 1984 1985
## 2 Jan 10.3 10.3 13.9 8.8 8.1 7.9 7.1 5.5 10.3
## 3 Feb 11.8 8 12 10.6 13.2 7.4 10.5 11.3 11.8
## 4 Mar 15.4 10.8 13.4 13.4 13.8 10 13.1 16.9 15.4
## 5 Apr 18.7 16.9 17.5 19.1 16.5 17.5 16.6 18.3 19.4
## 6 May 20.3 20.2 19.6 19.6 18.4 20.7 18.3 19.7 19.2
- Noticed the header of the table?
- Let’s change the colunm name, the column names are in first row of the table, so copy it to column name then assign and delete the row it from table
colnames(tb.df) = as.character(unlist(tb.df[1,]))
tb.df = tb.df[-1, ]
head(tb.df)
## month 1977 1978 1979 1980 1981 1982 1983 1984 1985
## 2 Jan 10.3 10.3 13.9 8.8 8.1 7.9 7.1 5.5 10.3
## 3 Feb 11.8 8 12 10.6 13.2 7.4 10.5 11.3 11.8
## 4 Mar 15.4 10.8 13.4 13.4 13.8 10 13.1 16.9 15.4
## 5 Apr 18.7 16.9 17.5 19.1 16.5 17.5 16.6 18.3 19.4
## 6 May 20.3 20.2 19.6 19.6 18.4 20.7 18.3 19.7 19.2
## 7 Jun 22 20.3 20.3 20.2 20.4 21.9 21.3 19.8 21.9
write.csv(tb.df, "......../pdf2table.csv")
DONE
No comments:
Post a Comment