@echo off
rem
rem Licensed to the Apache Software Foundation (ASF) under one or more
rem contributor license agreements. See the NOTICE file distributed with
rem this work for additional information regarding copyright ownership.
rem The ASF licenses this file to You under the Apache License, Version 2.0
rem (the "License"); you may not use this file except in compliance with
rem the License. You may obtain a copy of the License at
rem
rem http://www.apache.org/licenses/LICENSE-2.0
rem
rem Unless required by applicable law or agreed to in writing, software
rem distributed under the License is distributed on an "AS IS" BASIS,
rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
rem See the License for the specific language governing permissions and
rem limitations under the License.
rem
rem The Crawl command script : crawl <seedDir> <crawlDir> <solrURL> <numberOfRounds>
rem
rem
rem UNLIKE THE NUTCH ALL-IN-ONE-CRAWL COMMAND THIS SCRIPT DOES THE LINK INVERSION AND
rem INDEXING FOR EACH SEGMENT
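rem
rem Example invocation, assuming the script is run from the Nutch home
rem directory and a Solr core named "nutch" exists; all values are illustrative:
rem
rem   crawl.bat urls crawl http://localhost:8983/solr/nutch 2
rem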
setlocal
set SEEDDIR=%~1
set CRAWL_PATH=%~2
set SOLRURL=%~3
set LIMIT=%~4
set NUTCH_OPTS_ENV=%NUTCH_OPTS%
if "%SEEDDIR%" == "" (
echo Missing seedDir : crawl ^<seedDir^> ^<crawlDir^> ^<solrURL^> ^<numberOfRounds^>
exit /B 1
)
if "%CRAWL_PATH%" == "" (
echo Missing crawlDir : crawl ^<seedDir^> ^<crawlDir^> ^<solrURL^> ^<numberOfRounds^>
exit /B 1
)
if "%SOLRURL%" == "" (
echo Missing SOLRURL : crawl ^<seedDir^> ^<crawlDir^> ^<solrURL^> ^<numberOfRounds^>
exit /B 1
)
if "%LIMIT%" == "" (
echo Missing numberOfRounds : crawl ^<seedDir^> ^<crawlDir^> ^<solrURL^> ^<numberOfRounds^>
exit /B 1
)
rem #############################################
rem # MODIFY THE PARAMETERS BELOW TO YOUR NEEDS #
rem #############################################
rem set the number of slaves nodes
set numSlaves=1
rem and the total number of available tasks
rem sets Hadoop parameter "mapred.reduce.tasks"
set /a numTasks=%numSlaves% * 2
rem number of urls to fetch in one iteration
rem 50K per slave node
set /a sizeFetchlist=%numSlaves% * 50000
rem time limit for fetching, in minutes
set timeLimitFetch=180
rem num threads for fetching
set numThreads=50
echo numSlaves=%numSlaves%
echo numTasks=%numTasks%
echo sizeFetchlist=%sizeFetchlist%
echo timeLimitFetch=%timeLimitFetch%
echo numThreads=%numThreads%
rem #############################################
rem determine the run mode (local or distributed) based on the presence of a job file
set NUTCH_JOB=
for %%i in ("%NUTCH_HOME%\*nutch*.job") do set NUTCH_JOB=%%i
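rem if a job file such as apache-nutch-1.x.job is found under NUTCH_HOME the
rem script assumes distributed mode, otherwise it runs in local mode
rem (the job file name above is illustrative)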
SET BIN=%~dp0
rem note that some of the options listed here could instead be set in the
rem corresponding Hadoop site XML config file
SET commonOptions=-Dmapred.reduce.tasks=%numTasks% -Dmapred.child.java.opts=-Xmx1000m -Dmapred.reduce.tasks.speculative.execution=false -Dmapred.map.tasks.speculative.execution=false -Dmapred.compress.map.output=true
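rem these are the pre-YARN Hadoop property names; on newer Hadoop versions the
rem equivalents would be mapreduce.job.reduces, mapreduce.map.speculative,
rem mapreduce.reduce.speculative and mapreduce.map.output.compress, with
rem mapred.child.java.opts split into mapreduce.map.java.opts and mapreduce.reduce.java.opts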
rem check that hadoop can be found on the path
if defined NUTCH_JOB (
rem use IF ERRORLEVEL here because percent expansion of ERRORLEVEL happens before the block executes
WHERE hadoop.cmd >nul 2>&1
IF ERRORLEVEL 1 (
echo Can't find Hadoop executable. Add HADOOP_HOME/bin to the path or run in local mode.
exit /B 1
)
)
rem initial injection
SET NUTCH_OPTS=%NUTCH_OPTS_ENV%
call %bin%nutch.bat inject "%CRAWL_PATH%/crawldb" "%SEEDDIR%"
if %ERRORLEVEL% NEQ 0 (
echo Errorlevel from previous command is %ERRORLEVEL% - crawling stopped.
exit /B %ERRORLEVEL%
)
rem main loop : rounds of generate - fetch - parse - update
setlocal ENABLEDELAYEDEXPANSION
for /L %%i IN (1,1,%LIMIT%) do (
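rem to stop the crawl gracefully, create a file matching *.STOP, e.g. crawl.STOP,
rem in the working directory; the loop then exits before the next iteration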
if exist *.STOP (
echo STOP file found - exiting the loop
goto break
)
echo !DATE! !TIME! : Iteration %%i of %LIMIT%
echo Generating a new segment
SET NUTCH_OPTS=%NUTCH_OPTS_ENV% %commonOptions%
call %bin%nutch.bat generate "%CRAWL_PATH%/crawldb" "%CRAWL_PATH%/segments" -topN %sizeFetchlist% -numFetchers %numSlaves% -noFilter
if !ERRORLEVEL! NEQ 0 (
echo Errorlevel from previous command is !ERRORLEVEL! - crawling stopped.
exit /B !ERRORLEVEL!
)
rem capture the name of the segment
rem call hadoop in distributed mode
rem or use ls
SET SEGMENT=
if not defined NUTCH_JOB (
rem segment names are timestamps, so reverse name order /O-N puts the newest segment first
rem alternatively, the creation date can be used: /O-D /TC instead of /O-N
rem by quoting the path it is allowed to contain forward slashes
for /F %%s IN ('dir "%CRAWL_PATH%\segments" /O-N /AD /B') do (
if not defined SEGMENT (
set SEGMENT=%%s
)
)
) else (
rem not yet rewritten for hdfs usage on windows
echo Not yet rewritten for hdfs usage on windows, stopping ...
exit /B 1
rem set SEGMENT=`hadoop fs -ls %CRAWL_PATH%/segments/ ^| grep segments ^| sed -e "s/\//\\n/g" ^| egrep 20[0-9]+ ^| sort -n ^| tail -n 1`
for /F %%s IN ('hdfs dfs -ls %CRAWL_PATH%/segments/ ^| findstr "segments/"') do (
if not defined SEGMENT (
set SEGMENT=%%s
)
)
)
echo Operating on segment : !SEGMENT!
rem fetching the segment
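rem -noParsing defers parsing to the separate parse step below, so a parser
rem failure cannot spoil the fetched data; numThreads fetcher threads are used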
echo Fetching : !SEGMENT!
SET NUTCH_OPTS=%NUTCH_OPTS_ENV% %commonOptions% -Dfetcher.timelimit.mins=%timeLimitFetch%
call %bin%nutch.bat fetch "%CRAWL_PATH%/segments/!SEGMENT!" -noParsing -threads %numThreads%
if !ERRORLEVEL! NEQ 0 (
echo Errorlevel from previous command is !ERRORLEVEL! - crawling stopped.
exit /B !ERRORLEVEL!
)
rem parsing the segment
echo Parsing : !SEGMENT!
rem enable record skipping during parsing so that a dodgy document
rem does not fail the whole task
SET NUTCH_OPTS=%NUTCH_OPTS_ENV% %commonOptions% -Dmapred.skip.attempts.to.start.skipping=2 -Dmapred.skip.map.max.skip.records=1
call %bin%nutch.bat parse "%CRAWL_PATH%/segments/!SEGMENT!"
if !ERRORLEVEL! NEQ 0 (
echo Errorlevel from previous command is !ERRORLEVEL! - crawling stopped.
exit /B !ERRORLEVEL!
)
rem updatedb with this segment
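rem updatedb merges the fetch and parse results of this segment back into the
rem crawldb: fetch statuses, re-fetch schedules and newly discovered URLs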
echo CrawlDB update
SET NUTCH_OPTS=%NUTCH_OPTS_ENV% %commonOptions%
call %bin%nutch.bat updatedb "%CRAWL_PATH%/crawldb" "%CRAWL_PATH%/segments/!SEGMENT!"
if !ERRORLEVEL! NEQ 0 (
echo Errorlevel from previous command is !ERRORLEVEL! - crawling stopped.
exit /B !ERRORLEVEL!
)
rem the link inversion and indexing routine is done within the main loop
rem on a per segment basis
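rem invertlinks builds the linkdb, which maps each URL to its incoming links
rem and their anchor texts, later used as a relevance signal when indexing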
echo Link inversion
SET NUTCH_OPTS=%NUTCH_OPTS_ENV%
call %bin%nutch.bat invertlinks "%CRAWL_PATH%/linkdb" "%CRAWL_PATH%/segments/!SEGMENT!"
if !ERRORLEVEL! NEQ 0 (
echo Errorlevel from previous command is !ERRORLEVEL! - crawling stopped.
exit /B !ERRORLEVEL!
)
echo Dedup on crawldb
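rem dedup marks duplicate entries in the crawldb; the clean job at the end of
rem this iteration then removes such documents from the Solr index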
SET NUTCH_OPTS=%NUTCH_OPTS_ENV%
call %bin%nutch.bat dedup "%CRAWL_PATH%/crawldb"
if !ERRORLEVEL! NEQ 0 (
echo Errorlevel from previous command is !ERRORLEVEL! - crawling stopped.
exit /B !ERRORLEVEL!
)
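rem the index job sends the parsed documents of this segment to Solr, enriched
rem with crawldb metadata and linkdb anchors; solr.server.url selects the target core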
echo Indexing !SEGMENT! on SOLR index -^> %SOLRURL%
SET NUTCH_OPTS=%NUTCH_OPTS_ENV%
call %bin%nutch.bat index "-Dsolr.server.url=%SOLRURL%" "%CRAWL_PATH%/crawldb" -linkdb "%CRAWL_PATH%/linkdb" "%CRAWL_PATH%/segments/!SEGMENT!"
if !ERRORLEVEL! NEQ 0 (
echo Errorlevel from previous command is !ERRORLEVEL! - crawling stopped.
exit /B !ERRORLEVEL!
)
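rem the clean job deletes documents marked as gone or duplicate from the Solr index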
echo Cleanup on SOLR index -^> %SOLRURL%
SET NUTCH_OPTS=%NUTCH_OPTS_ENV%
call %bin%nutch.bat clean "-Dsolr.server.url=%SOLRURL%" "%CRAWL_PATH%/crawldb"
if !ERRORLEVEL! NEQ 0 (
echo Errorlevel from previous command is !ERRORLEVEL! - crawling stopped.
exit /B !ERRORLEVEL!
)
)
endlocal
:break
endlocal
exit /B 0