R: read multi PDF ke tidytext: Difference between revisions
From OnnoCenterWiki
Jump to navigationJump to search
Onnowpurbo (talk | contribs) Created page with " library(tidyverse) library(tidytext) library(tm) directory <- "data-pdf" # create corpus from pdfs converted <- VCorpus(DirSource(directory), readerControl = list(rea..." |
Onnowpurbo (talk | contribs) No edit summary |
||
| Line 1: | Line 1: | ||
==dengan tidytext dan tm== | |||
library(tidyverse) | library(tidyverse) | ||
| Line 14: | Line 15: | ||
filter(!grepl("[0-9]+", term)) | filter(!grepl("[0-9]+", term)) | ||
==dengan tidytext tanpa tm== | |||
directory <- "data-pdf" | |||
pdfs <- paste(directory, "/", list.files(directory, pattern = "*.pdf"), sep = "") | |||
pdf_names <- list.files(directory, pattern = "*.pdf") | |||
pdfs_text <- map(pdfs, pdftools::pdf_text) | |||
my_data <- data_frame(document = pdf_names, text = pdfs_text) | |||
my_data %>% | |||
unnest %>% # pdfs_text is a list | |||
unnest_tokens(word, text, strip_numeric = TRUE) %>% # removing all numbers | |||
group_by(document, word) %>% | |||
summarise(count = n()) | |||
Revision as of 04:30, 6 November 2018
dengan tidytext dan tm
# Build a tidy term table from every PDF in a directory using tm + tidytext.
library(tidyverse)
library(tidytext)
library(tm)

# Directory that holds the PDF files to read.
directory <- "data-pdf"

# Create a corpus from the PDFs and turn it into a document-term matrix.
# The comment above must stay on its own line: when it shares a line with
# the assignment, the assignment is commented out and `converted` is never
# defined.
converted <- VCorpus(
  DirSource(directory),
  readerControl = list(reader = readPDF)
) %>%
  DocumentTermMatrix()

# Tidy the matrix and drop every term that contains a digit.
converted %>%
  tidy() %>%
  filter(!grepl("[0-9]+", term))
dengan tidytext tanpa tm
# Tokenise every PDF in `directory` with pdftools + tidytext (no tm needed)
# and count how often each word occurs in each document.
directory <- "data-pdf"

# `pattern` is a regular expression, not a glob: anchor on the literal
# ".pdf" extension so that names merely containing "pdf" are not matched.
# `full.names = TRUE` returns ready-to-use paths, so the directory is
# listed only once and no manual path pasting is needed.
pdfs <- list.files(directory, pattern = "\\.pdf$", full.names = TRUE)
pdf_names <- basename(pdfs)

# Fail early with a clear message instead of passing a bogus path along.
if (length(pdfs) == 0) {
  stop("No PDF files found in '", directory, "'", call. = FALSE)
}

# One list element per file, each holding the text extracted from that PDF.
pdfs_text <- map(pdfs, pdftools::pdf_text)

# tibble() replaces the deprecated data_frame().
my_data <- tibble(document = pdf_names, text = pdfs_text)

my_data %>%
  unnest(text) %>% # pdfs_text is a list
  unnest_tokens(word, text, strip_numeric = TRUE) %>% # removing all numbers
  count(document, word, name = "count") # per-document word counts, ungrouped