-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathimdb_scraping.R
79 lines (57 loc) · 1.75 KB
/
imdb_scraping.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
library(rvest)
library(stringr)
library(purrr)
library(RSQLite)
library(dplyr)
# Open the SQLite database file that stores the scraped movie graph.
db_conn <- dbConnect(RSQLite::SQLite(), "~/imdb.db")

# Load the project helpers. Presumably these files define scrape_movie(),
# add_to_database() and recreate_tables(), which are called below -- the
# definitions are not visible from this script.
source("functions/scrape_movie.R")
source("functions/add_to_database.R")
source("functions/recreate_tables.R")

# Destroy and recreate all tables in sqlite db.
# Every run of this script therefore starts from an empty database.
recreate_tables(db_conn)
# First iteration: seed the traversal with a single, hard-coded title.
# TODO: Exception for TV Shows
input_list <- scrape_movie("The Man in the Iron Mask")
# Presumably the identifier of the stored movie -- confirm in
# functions/add_to_database.R.
movie_id <- add_to_database(input_list, db_conn)

# data.frame() cannot pair one source id with zero targets: an empty vector
# raises a row-count mismatch and NULL silently drops the `target` column.
# Fail with a clear message instead.
if (length(input_list$similar) == 0) {
  stop(
    "The seed movie has no similar titles; there is nothing to traverse.",
    call. = FALSE
  )
}

# Edge list for the first level: one row per similar title, each pointing
# back to the seed movie as its source.
current_movies <- data.frame(
  source = movie_id,
  target = input_list$similar,
  stringsAsFactors = FALSE
)
# Level-by-level traversal of the "similar movies" graph.
# `current_movies` holds the (source, target) pairs of the level being
# processed; every target scraped on this level contributes its own similar
# titles to the next level. The loop ends once a level yields no new pairs.
# NOTE(review): an earlier comment promised a delay so that we don't get
# blocked by IMDb, but no delay is applied in this loop -- presumably
# scrape_movie() throttles itself; confirm, otherwise add a Sys.sleep()
# before each scrape.
while (nrow(current_movies) > 0) {
  # Edge lists produced on this level, one data frame per scraped movie.
  next_batches <- list()

  for (i in seq_len(nrow(current_movies))) {
    source_id <- current_movies[i, 1]
    target_name <- current_movies[i, 2]

    # Skip titles that are already stored. The title is bound as a parameter
    # so that apostrophes or other SQL metacharacters in scraped titles can
    # neither break nor alter the query.
    count_movie <- dbGetQuery(
      db_conn,
      "select count(*) as count from movies where title = ?",
      params = list(target_name)
    )
    if (count_movie$count > 0) next

    # Scraping is best effort: report a failing title and move on.
    input_list <- tryCatch(
      scrape_movie(target_name),
      error = function(e) {
        message("Skipping '", target_name, "': ", conditionMessage(e))
        NULL
      }
    )
    if (is.null(input_list)) next

    movie_id <- add_to_database(input_list, db_conn)

    # Insert into mov2mov (source_id, movie_id)
    insert_junction_table(
      tablename = "movies_to_movies",
      source = source_id,
      target = movie_id,
      db_conn = db_conn
    )

    # Queue this movie's similar titles for the next level.
    if (length(input_list$similar) > 0) {
      next_batches[[length(next_batches) + 1L]] <- data.frame(
        source = movie_id,
        target = input_list$similar,
        stringsAsFactors = FALSE
      )
    }
  }

  # Newest batch first, so rows are visited in the same order as when each
  # batch was prepended to the accumulated frame. An empty list binds to a
  # zero-row frame, which ends the traversal.
  current_movies <- bind_rows(rev(next_batches))
}

# The traversal is complete; release the database connection.
DBI::dbDisconnect(db_conn)