-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathNotifyWebChanges.R
More file actions
172 lines (148 loc) · 6.12 KB
/
NotifyWebChanges.R
File metadata and controls
172 lines (148 loc) · 6.12 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
# title: "NotifyWebchanges"
# author: "Xavier de Pedro"
# date: "04/01/2016"
# system.packages: sudo apt-get install sendemail
# install.packages(c("rvest", "dplyr", "data.table", "sendmailR"))
# Load dependencies. library() stops with an error when a package is missing,
# whereas require() only returns FALSE and lets the script fail later on.
library(methods)
library(rvest)

# File where each run stores its results (written by save() further below).
my.rda.file <- "last.biocat.jobs.Rda"

# Restore the job list of the previous run, if there is one.
# The file is loaded into a private environment so that its contents do not
# silently overwrite objects in the global workspace, and so that a file
# lacking `jobs.list.all` yields NULL instead of an error.
if (file.exists(my.rda.file)) {
  previous.run <- new.env()
  load(file = my.rda.file, envir = previous.run)
  jobs.list.all.previous <- previous.run[["jobs.list.all"]]
} else {
  # First run: there is nothing to compare against yet.
  jobs.list.all.previous <- NULL
}
#table(job.list.all.previous[,Perfil] == job.list.all[,Perfil])
# Base URL of the paginated job board; the zero-based page index is appended.
url_base <- "http://www.biocat.cat/ca/que-fem/borsa-de-treball-i-practiques?page="

# Number of result pages to fetch on every run.
n.pages <- 4

# Per-page containers, always started empty.
webpage <- list()
jobs.list <- list()
jobs.links <- list()

for (ii in seq_len(n.pages)) {
  # Download the html of page ii (the site numbers its pages from 0).
  webpage[[ii]] <- read_html(paste0(url_base, ii - 1))

  # Only fetch the job list when the string "No open positions available"
  # is not found in the downloaded html.
  no.jobs <- any(grepl("No open positions available",
                       html_text(webpage[[ii]]), fixed = TRUE))
  if (no.jobs) {
    next
  }

  # The data we want is in the first table of the page; skip the page when
  # it contains no table at all instead of failing with a subscript error.
  page.tables <- html_nodes(webpage[[ii]], "table")
  if (length(page.tables) == 0) {
    next
  }

  # html_table() coerces the table into a data frame (one row per job).
  jobs.list[[ii]] <- html_table(page.tables[[1]])

  # Fetch the links to the pdf of each position as well.
  jobs.links[[ii]] <- webpage[[ii]] %>%
    html_nodes(".views-field-field-documento") %>%
    html_nodes(".file") %>%
    html_nodes("a") %>%
    html_attr("href")

  # Merge the table and the links. The new column is named explicitly
  # rather than by position, so it is called "link" whatever the number of
  # columns in the table.
  jobs.list[[ii]] <- cbind(jobs.list[[ii]], link = jobs.links[[ii]])
}
library(data.table)

# Timestamp of this run. "%m" is the month; the previous "%M" inserted the
# minutes in the month position.
last.date <- format(Sys.time(), "%Y-%m-%d %X")

# Stack the per-page tables into a single table (pages without jobs are NULL
# entries, which rbindlist() skips).
jobs.list.all <- rbindlist(jobs.list)

# Current list as a plain data frame with the link stored as character.
df2 <- data.frame(jobs.list.all)
df2$link <- as.character(df2$link)

# Previous list in the same shape. On the first run there is no previous
# list, so an empty data frame with the same columns as df2 is used instead;
# this keeps setdiff() working and reports every current job as new.
if (is.null(jobs.list.all.previous)) {
  df1 <- df2[0, , drop = FALSE]
} else {
  df1 <- data.frame(jobs.list.all.previous)
  df1$link <- as.character(df1$link)
  # If the stored table has a different layout (e.g. the website changed its
  # columns), the two lists cannot be compared row by row: treat all current
  # jobs as new rather than failing.
  if (!identical(names(df1), names(df2))) {
    df1 <- df2[0, , drop = FALSE]
  }
}

# New jobs are the rows of the current list that are absent from the previous
# one. This is always computed, so `jobs.new` is defined (possibly with zero
# rows) even when nothing changed.
jobs.new <- data.table(dplyr::setdiff(df2, df1))
# Save the results of this run to disk; the next run loads this file to get
# the list of jobs to compare against.
save(last.date, jobs.list.all, jobs.new, file = my.rda.file)

# Create the output folder if missing.
folder.txts <- "TXT.BIOCAT"
if (!dir.exists(folder.txts)) {
  dir.create(folder.txts)
}

# Compose the file names. The date is read once so that both names always
# carry the same day, even if the script runs across midnight.
today <- Sys.Date()
outFileName.new.noext <- paste0(today, "_jobs.BIOCAT_list.new")
outFileName.all.noext <- paste0(today, "_jobs.BIOCAT_list.all")
outFileNames <- c(paste0(outFileName.new.noext, ".txt"),
                  paste0(outFileName.all.noext, ".txt"))

# Write the results to disk. append = FALSE overwrites any file of the same
# day, so no explicit removal is needed, and it avoids the
# "appending column names to file" warning that append = TRUE raises.
write.table(jobs.list.all,
            file.path(folder.txts, paste0(outFileName.all.noext, ".txt")),
            quote = FALSE, sep = " | ", row.names = TRUE, append = FALSE)
write.table(jobs.new,
            file.path(folder.txts, paste0(outFileName.new.noext, ".txt")),
            quote = FALSE, sep = " | ", row.names = TRUE, append = FALSE)
# Send an email with the lists of jobs attached, through the external
# `sendEmail` command line program (see "system.packages" at the top).
from <- "xavi@confluencia.net"
# Several recipients can be given in one string, separated by spaces, e.g.
# to <- "xdpedro@ir.vhebron.net another@example.com athird@example.com"
to <- "xavier.depedro@seeds4c.org"
subject <- sprintf("[JOBS] BIOCAT: %s", Sys.Date())
body <- "See the list of new jobs (since the last email) in the first attachment, and the full list of jobs in this website in the second attachment below."
cc <- NULL
bcc <- NULL
headers <- NULL
# The smtp server is not passed to sendEmail for the time being, while working
# in localhost on a linux machine out of firewalls.
smtp <- "" # "smtp.ir.vhebron.net"

cat("\nSending the email confirming the job has been done... ")

# Full paths of the two text files to attach.
attachmentPath.new <- file.path(getwd(), folder.txts, paste0(outFileName.new.noext, ".txt"))
attachmentPath.all <- file.path(getwd(), folder.txts, paste0(outFileName.all.noext, ".txt"))

# Split a space separated list of addresses and quote each one for the shell.
# Returns character(0) for NULL or empty input.
quote.addresses <- function(addresses) {
  if (length(addresses) == 0) {
    return(character(0))
  }
  parts <- unlist(strsplit(addresses, "[[:space:]]+"))
  shQuote(parts[nzchar(parts)])
}

# Build "flag addr1 addr2 ..." or nothing at all when there are no addresses,
# so that no empty "-cc" / "-bcc" flag reaches sendEmail.
address.flag <- function(flag, addresses) {
  quoted <- quote.addresses(addresses)
  if (length(quoted) > 0) c(flag, quoted) else character(0)
}

# Every value is passed through shQuote() so that spaces or quotes in the
# subject, body or paths cannot break (or inject into) the shell command.
# The message keeps the trailing newline it has always been sent with.
# NOTE(review): the output of sendEmail is appended to the "all" attachment
# file, as before; confirm whether a separate log file was intended.
command <- paste(c("sendEmail",
                   "-f", shQuote(from),
                   address.flag("-t", to),
                   address.flag("-cc", cc),
                   address.flag("-bcc", bcc),
                   "-u", shQuote(subject),
                   "-m", shQuote(paste0(body, "\n")),
                   "-a", shQuote(attachmentPath.new),
                   "-a", shQuote(attachmentPath.all),
                   "-o", "tls=no",
                   "-o", "message-charset=utf-8",
                   ">>", shQuote(attachmentPath.all)),
                 collapse = " ")

# system() returns the exit status of the command; only report success when
# sendEmail actually succeeded.
status <- system(command)
if (status == 0) {
  cat("\nEmail sent.\n ")
} else {
  warning("sendEmail failed with exit status ", status, call. = FALSE)
}
# Call through the command line with:
#
# Rscript "/home/xavi/code/NotifyWebChanges/NotifyWebChanges.R"
#
# or
#
# R CMD BATCH "/home/xavi/code/NotifyWebChanges/NotifyWebChanges.R"
# cat NotifyWebChanges.Rout
## To run it as a cron job on a GNU/Linux machine, edit your user's crontab with:
#
# crontab -e
#
## Content to add (something like this for days from Mon to Friday at 10 a.m.):
#
## m h dom mon dow command
#0 10 * * 1,2,3,4,5 cd /home/xavi/code/webchanges/;R CMD BATCH NotifyWebChanges.R