Some useful scripts for running nutch in real life.
A simple loop that executes the nutch commands:
# Simple crawl loop: runs generate -> fetch -> updatedb -> analyze -> index ->
# dedup LIMIT times and mails a disk-usage report after every round.
# Requires bash (C-style for loop). Paths are relative, so run it from the
# directory that contains bin/nutch, db and segments.

# number of loops
LIMIT=20
# number of pages to fetch
MAXPAGES=50000
# your mail address
MAIL=you@domain.com

for ((a = 1; a <= LIMIT; a++)); do
  echo "************** start new crawl loop $a**************"
  bin/nutch generate db segments -topN "$MAXPAGES" > "gen_$a.log" 2>&1

  # Select the last directory matching segments/2*. Glob results are sorted,
  # so this is the same entry "ls -d segments/2* | tail -1" would print,
  # without parsing ls output.
  s1=
  for seg in segments/2*; do
    if [ -d "$seg" ]; then
      s1=$seg
    fi
  done
  if [ -z "$s1" ]; then
    # Without a segment the remaining commands would run with a missing
    # argument, so stop instead of producing misleading logs.
    echo "no segment found under segments/ (see gen_$a.log)" >&2
    exit 1
  fi
  echo "$s1"

  bin/nutch fetch "$s1" > "fetch_$a.log" 2>&1
  bin/nutch updatedb db "$s1" > "update_$a.log" 2>&1
  bin/nutch analyze db 3 > "analyse_$a.log" 2>&1
  bin/nutch index "$s1" > "index_$a.log" 2>&1
  bin/nutch dedup segments "dedup$a.tmp" > "dedup_$a.log" 2>&1

  # Double quotes so the loop number is expanded in the subject; with single
  # quotes the subject was the literal text "nutch loop $a done".
  du -hs db segments | mail -s "nutch loop $a done" "$MAIL"
done
exit 0
Scripts for simple parallelizing of nutch processes
The scripts can be used to distribute the nutch processes over a set of machines. It is possible to run fetching, indexing and "dedup" at the same time. However, since the web database has a centralized architecture, it is not possible to run the most time-consuming tasks (segment generation, database analysis and update) at the same time.
The scripts require that all machines share the same hard drive, for example a NAS (network attached storage), but using the nutch DFS might be an interesting alternative.
fetching script:
# fetcher script by stefan groschupf sg(AT)http://www.media-style.com
#
# Polls ./segments forever. Every segment directory that contains the marker
# file segment.done is fetched with "bin/nutch fetch". While the fetch runs
# the directory holds fetch.running; afterwards it holds fetch.done.
# Paths are relative, so start the script from the nutch directory.
file=segment.done
processFile=fetch.running
doneFile=fetch.done

# The per-run log files are written below ./logs; create it up front so the
# redirections further down cannot fail on a missing directory.
mkdir -p ./logs

while true; do
  for i in ./segments/*; do
    [ -d "$i" ] || continue
    searchFile=$i/$file
    [ -f "$searchFile" ] || continue
    echo "$searchFile file exist"
    # Removing the marker doubles as claiming the segment: rm fails when the
    # file is already gone, so if another fetcher on the shared drive removed
    # it first, this one skips the segment instead of fetching it again.
    rm "$searchFile" || continue
    pFile=$i/$processFile
    echo "done" > "$pFile"
    # run command
    FileName=./logs/$(date +%y_%m_%d_%H_%M_%S)_$processFile.log
    echo "$FileName"
    echo "start command" >> "$FileName" 2>&1
    bin/nutch fetch "$i" >> "$FileName" 2>&1
    echo "command done" >> "$FileName" 2>&1
    rm "$pFile"
    dFile=$i/$doneFile
    echo "done" > "$dFile"
  done
  #echo "sleep for a while"
  sleep 10
done
web db script:
# webdb script by stefan groschupf sg(AT)http://www.media-style.com
#
# Polls ./segments forever. Every segment directory that contains fetch.done
# is fed into the web db (updatedb, analyze), then a new segment is generated
# and marked with segment.done. The processed segment holds webdb.running
# while the commands run and webdb.done afterwards.
# Paths are relative, so start the script from the nutch directory.
file=fetch.done
processFile=webdb.running
doneFile=webdb.done

# The per-run log files are written below ./logs; create it up front so the
# redirections further down cannot fail on a missing directory.
mkdir -p ./logs

# Prints the last directory matching segments/2* (an empty line when there is
# none). Glob results are sorted, so this is the entry that
# "ls -d segments/2* | tail -1" would print, without parsing ls output.
newest_segment() {
  newest=
  for seg in segments/2*; do
    if [ -d "$seg" ]; then
      newest=$seg
    fi
  done
  printf '%s\n' "$newest"
}

while true; do
  for i in ./segments/*; do
    [ -d "$i" ] || continue
    searchFile=$i/$file
    [ -f "$searchFile" ] || continue
    echo "$searchFile file exist"
    # Removing the marker doubles as claiming the segment: rm fails when the
    # file is already gone, so a second instance skips it.
    rm "$searchFile" || continue
    pFile=$i/$processFile
    echo "done" > "$pFile"
    # run command
    FileName=./logs/$(date +%y_%m_%d_%H_%M_%S)_$processFile.log
    echo "$FileName"
    echo "start command" >> "$FileName" 2>&1
    # Remember the newest segment before generating, to detect below whether
    # generate really produced a new one.
    before=$(newest_segment)
    # Command output goes to the log file, as in the fetch/index/dedup
    # scripts; previously the log only ever received "start command".
    bin/nutch updatedb db "$i" >> "$FileName" 2>&1
    bin/nutch analyze db 2 >> "$FileName" 2>&1
    bin/nutch generate db segments -topN 1000000 >> "$FileName" 2>&1
    echo "command done" >> "$FileName" 2>&1
    s3=$(newest_segment)
    echo "$s3"
    rm "$pFile"
    dFile=$i/$doneFile
    echo "done" >> "$dFile"
    # Only mark a segment that did not exist before generate ran. Marking the
    # previous newest segment after a failed generate would write segment.done
    # into an already handled segment.
    if [ -n "$s3" ] && [ "$s3" != "$before" ]; then
      echo "done" >> "$s3/segment.done"
    else
      echo "generate produced no new segment, see $FileName" >&2
    fi
  done
  #echo "sleep for a while"
  sleep 10
done
indexing script:
# index script by stefan groschupf sg(AT)http://www.media-style.com
#
# Polls ./segments forever. Every segment directory that contains webdb.done
# is indexed with "bin/nutch index"; afterwards the web application's web.xml
# is touched. The segment holds indexer.running while the indexer runs and
# indexer.done afterwards.
# Paths are relative, so start the script from the nutch directory.
file=webdb.done
processFile=indexer.running
doneFile=indexer.done

# File that is touched after every index run. Override by exporting WEB_XML;
# the default is the path that used to be hard-coded here.
# NOTE(review): touching web.xml presumably makes Tomcat reload the search
# webapp so it sees the new index - confirm against the Tomcat setup.
WEB_XML=${WEB_XML:-/Users/myUser/nutch/uiserver/jakarta-tomcat-5.5.4/webapps/ROOT/WEB-INF/web.xml}

# The per-run log files are written below ./logs; create it up front so the
# redirections further down cannot fail on a missing directory.
mkdir -p ./logs

while true; do
  for i in ./segments/*; do
    [ -d "$i" ] || continue
    searchFile=$i/$file
    [ -f "$searchFile" ] || continue
    echo "$searchFile file exist"
    # Removing the marker doubles as claiming the segment: rm fails when the
    # file is already gone, so if another indexer on the shared drive removed
    # it first, this one skips the segment instead of indexing it again.
    rm "$searchFile" || continue
    pFile=$i/$processFile
    echo "done" > "$pFile"
    # run command
    FileName=./logs/$(date +%y_%m_%d_%H_%M_%S)_$processFile.log
    echo "$FileName"
    echo "start command" >> "$FileName" 2>&1
    bin/nutch index "$i" >> "$FileName" 2>&1
    echo "command done" >> "$FileName" 2>&1
    touch "$WEB_XML"
    rm "$pFile"
    dFile=$i/$doneFile
    echo "done" >> "$dFile"
  done
  #echo "sleep for a while"
  sleep 10
done
dedup script:
# dedup script by stefan groschupf sg(AT)http://www.media-style.com
#
# Polls ./segments forever. Whenever a segment directory contains
# indexer.done, "bin/nutch dedup" is run over the whole segments directory
# (not just that segment). The triggering segment holds dedup.running while
# the command runs and dedup.done afterwards.
# Paths are relative, so start the script from the nutch directory.
file=indexer.done
processFile=dedup.running
doneFile=dedup.done

# The per-run log files are written below ./logs; create it up front so the
# redirections further down cannot fail on a missing directory.
mkdir -p ./logs

while true; do
  for i in ./segments/*; do
    [ -d "$i" ] || continue
    searchFile=$i/$file
    [ -f "$searchFile" ] || continue
    echo "$searchFile file exist"
    # Removing the marker doubles as claiming the segment: rm fails when the
    # file is already gone, so a second instance skips it.
    rm "$searchFile" || continue
    pFile=$i/$processFile
    echo "done" > "$pFile"
    # run command
    FileName=./logs/$(date +%y_%m_%d_%H_%M_%S)_$processFile.log
    echo "$FileName"
    echo "start command" >> "$FileName" 2>&1
    bin/nutch dedup segments dedup.tmp >> "$FileName" 2>&1
    # Remove the temp file handed to dedup; any rm error lands in the log.
    rm dedup.tmp >> "$FileName" 2>&1
    echo "command done" >> "$FileName" 2>&1
    rm "$pFile"
    dFile=$i/$doneFile
    echo "done" >> "$dFile"
  done
  #echo "sleep for a while"
  sleep 10
done