Versions Compared

Key

  • This line was added.
  • This line was removed.
  • Formatting was changed.
Comment: Update properties table based on recent Nutch 1.x (master) and 2.4

...

This list of Nutch configuration properties is intended for development. It includes deprecated properties and properties used only "internally". The list is generated from nutch-default.xml and Java sources.

Legend:
Def: defined in nutch-default.xml

Used: read or set from Java code

Temp: temporarily used to pass settings (e.g., from command-line arguments) to map or reduce jobs

...

(owr.): some properties are defined in nutch-default.xml (and may be set in nutch-site.xml) but are overwritten programmatically by a command-line argument (tests and benchmarks are excluded)

(test): overwritten only in tests and benchmarks, e.g., via a command-line argument in some Nutch tools



1.x (master branch)

2.x (deprecated codebase)

Property

Def.

Used

Temp.

Depr.

Def.

Used

Temp.

Depr.CrawlDBScanner

anchorIndexingFilter.regexdeduplicateXX

CrawlDBScanner.status



XX

anchorIndexingFilterany23.deduplicatecontent_typesXX(test)





any23.extractorsXX

(test)







arc.url.version-X





contentbatch.serverproxy.port



X-X

crawlcontent.genserver.delayport-X

-X

cosine.goldstandard.fileXX





crawl.datum.processor.overdue.time.limit-crawldb.url.filtersX





crawl.gen.delayXX

crawldb.url.normalizersXX

crawldb.urlinject.filter.normalizersnormalize.scopeall-X





creativecommonscrawldb.excludeurl.unlicensedfiltersXX(owr.)


X
crawldb.url.normalizersdb.default.fetch.intervalXX(owr.)




crawldb.url.normalizers.scope-X





creativecommons.exclude.unlicensed-X

-X

NUTCH-1409

X

X

NUTCH-1409

db.fetch.interval.defaultXX

(test)



XX

db.fetch.interval.maxXX

XX

db.fetch.retry.maxXX

XX

db.fetch.schedule.adaptive.dec_rateXX

XX

db.fetch.schedule.adaptive.inc_rateXX

XX

db.fetch.schedule.adaptive.max_intervalXX

XX

db.fetch.schedule.adaptive.min_intervalXX

XX

db.fetch.schedule.adaptive.sync_deltaXX

XX

db.fetch.schedule.adaptive.sync_delta_rateXX-

XX-

db.fetch.schedule.classXX

(test)



XX

db.fetch.schedule.mime.fileXX





db.ignore.externalalso.linksredirectsXX





db.ignore.external.exemptions.fileXX-





db.ignore.internalexternal.linksXX

XX

db.ignore.external.injectorlinks.overwritemodeXX





db.ignore.internal.linksXX

X-

db.injector.overwriteinjector.updateXX(testowr.)




db.maxinjector.anchor.lengthupdateXXX(owr.)




db.max.fetchanchor.intervallength



XNUTCH-1409

X

NUTCH-1409



db.max.outlink.inlinksXlengthXX





db.max.outlinks.per.pageXX

XX

db.parsemeta.to.crawldbXX

X-

db.preserve.backupXX





db.reader.stats.sort-XX
-XX
db.reader.topn-XX




db.reader.topn.min-XX




db.score.count.filteredXX

XX

db.score.injectedXX

XX

db.score.link.externalXX

XX

db.score.link.internalXX

XX

db.signature.classXX

XX

db.signature.text_profile.min_token_lenXX-

XX-

db.signature.text_profile.quant_rateXX-

X-

db.stats.score.quantilesXX





db.update.additions.allowedXX

XX

db.update.max.inlinksXX

XX

db.update.purge.404XX





db.update.purge.orphansXX





dc.language-X





domain.statistics.mode-XX
-XX
elastic.cluster



X-

elastic.host



X-

elastic.index



X-

elastic.max.bulk.docs



X-

elastic.max.bulk.size



X-

elastic.port



X-

elasticsearch.conf



-X

encodingdetector.charset.min.confidenceXX

XX

exchanges.exchanges.fileXX





fail.on.job.failure



-X

fetcher.bandwidth.targetXX





fetcher.exitbandwidth.target.check.everyNSecsXX





fetcher.filter.urlsXX





fetcher.follow.outlinks.depthXX





fetcher.follow.outlinks.depth.divisorXX-





fetcher.follow.outlinks.ignore.externalXX-





fetcher.follow.outlinks.num.linksXX-





fetcher.job.resume



-X

fetcher.maxjob.crawl.delay

X

X

sitemap



-XX

fetcher.maxjob.exceptionssitemap.per.queuedetect



X-X

X



fetcher.max.crawl.parsedelayXX

(test)



XX

fetcher.queuemax.exceptions.depthper.multiplierqueueXX-

XX-

fetcher.queuemaxNum.modethreadsXX

X

X







fetcher.queuemin.usecrawl.host.settingsdelayXX





fetcher.server.delay

X

Xnormalize.urlsXX





fetcher.server.min.delayparseXX

XX

fetcher.store.contentpublisherXX





fetcher.queue.depth.multiplierXX

XX

fetcher.threadsqueue.fetchmodeXX(owr.)

XX

fetcher.threadsqueue.peruse.host.settings

NUTCH-1409





XXNUTCH-1409

fetcher.threads.per.host.by.ipredirect.dedupcache.secondsXX





fetcher.redirect.dedupcache.sizeXX





fetcher.threadsserver.per.queuedelayXX

XX

fetcher.threadsserver.timeoutmin.divisordelayXX

fetcher.throughput.threshold.check.after



XX

(owrfetcher.)signatureXX





fetcher.throughputstore.threshold.pagescontentXX

XX

fetcher.throughputstore.threshold.retriesrobotstxtXX





fetcher.throughputthreads.threshold.sequencefetchXXfetcher(owr.timelimit)
XXX

fetcher.threads.per.host.by.ip



-X

fetcher.threads.timelimitper.minsqueueXX

XX

fetcher.threads.timeout.verbosedivisorXX

X







file.content.ignored

X

X

file.content.limitfetcher.throughput.threshold.check.afterXX(testowr.)
X

X

(test)

-

fetcher.throughput.threshold.pagesfile.crawl.parentXX

XX-

freefetcher.throughput.generatorthreshold.filterretriesXX





freefetcher.throughput.generatorthreshold.normalizesequence



X-

fetcher.timelimit-ftp.content.limitXX
-XX
ftpfetcher.followtimelimit.talkminsXX

XX

ftpfetcher.keep.connectionverbose



X

X

X

X

-

file.content.ignoredftp.passwordXX-

XX-

ftpfile.servercontent.timeoutlimitXX(owr.)
XX

ftpfile.crawl.timeoutparentXX

XXftp

.usernamefile.crawl.redirect_noncanonicalXX-

XX-

generatefree.batchgenerator.idfilter-X





free.generator.normalize-X





ftp.content.limitgenerate.count.modeXX

XX

generateftp.follow.curTimetalkXX

generate.filter



XX

generateftp.maxkeep.countconnectionXX

XX

generateftp.max.distancepasswordXX

generate.max.num.segmentsXX

generateftp.maxserver.per.hosttimeoutXX

NUTCH-1409

NUTCH-1409

generate.max.per.host.by.ip

X

NUTCH-1409

NUTCH-1409



XX

ftp.timeoutXX

XX

ftp.usernamegenerate.min.intervalXX

generate.min.score



XX

X



generate.batch.normaliseid



X-X

generate.partition.seedcount



-X

generate.restrictcount.statusmodeXgenerate.topNX

XX

generate.update.crawldbcurTimeX-XX

-X

generate.expr-hostdb.concurrency.levelX





hostdbgenerate.lru.sizefetch.delay.exprXX





htmlparsefiltergenerate.orderfilterX-XX

-X

httpgenerate.accepthostdbXX





X

X

http.accept.languagegenerate.max.countXX

XX

httpgenerate.max.agentcount.descriptionexprXX





generate.max.distance



XX

httpgenerate.max.agentnum.emailsegments-XX





generate.min.intervalXX





httpgenerate.agentmin.hostscoreXX

XX

generate.normalise-http.agent.nameX

-X

(test)



generate.partition.seed



-XX

(test)

http.agent.urlgenerate.restrict.statusXX





generate.sitemap



X-X

httpgenerate.agent.versiontopNX-XX

-X

httpgenerate.authupdate.filecrawldbXX

XX

httpgora.buffer.authread.verboselimit



XX-

httpgora.buffer.contentwrite.limit



X-

hbase.indexer.commit.size



X-

hbase.indexer.mapping.file



X-

hbase.indexer.zookeeper.property.clientPort



X-

hbase.indexer.zookeeper.quorum



X-

headingsX-





headings.multivaluedhttp.max.delaysXX





httphostdb.proxycheck.host

X

X

(test)

X

X

(test)

failedXX





hostdb.check.knownXX





hostdb.check.newXX





hostdb.concurrency.level



-X

hostdb.crawldatum.processorsXX





hostdb.dump.field.header-X





hostdb.dump.homepages-X





hostdb.dump.hostnames-X





hostdb.filter.expression-X





hostdb.force.checkXX





hostdb.lru.size



-X

hostdb.num.resolvers.threadsXX





hostdb.numeric.fieldsXX





hostdb.percentilesXX





hostdb.purge.failed.hosts.thresholdXX





hostdb.reading.crawldb-XX




hostdb.recheck.intervalXX





hostdb.string.fieldsXX





hostdb.url.filterXX





hostdb.url.normalizeXX





htmlparsefilter.orderXX

XX

htmlunit.enable.cssXX





htmlunit.enable.javascriptXX





htmlunit.javascript.timeoutXX





http.acceptXX

XX

http.accept.charsetXX

XX

http.accept.languageXX

XX

http.agent.descriptionXX

XX

http.agent.emailXX

XX

http.agent.hostXX

XX

http.agent.host.cookie.fileXX





http.agent.nameXX(owr.)
XX

http.agent.rotateXX

XX

http.agent.rotate.fileXX

XX

http.agent.urlXX

XX

http.agent.versionXX

XX

http.auth.fileXX

XX

http.auth.verbose-X

-X

http.content.limitXX(owr.)
XX

http.content.truncated-X





http.content.truncated.reason-X





http.enable.cookie.headerXX





http.enable.if.modified.since.headerXX





http.log.exceptions.suppress.stackXX





http.max.delays



X-

http.partial.truncatedXX





http.proxy.exception.listXX





http.proxy.hostXX

XX

http.proxy.passwordXX

XX

http.proxy.portXX

XX

http.proxy.realmXX

XX

http.proxy.typeXX





http.proxy.usernameXX

XX

http.redirect.maxXX





http.redirect.max.exceeded.skipXX





http.robot.rules.whitelistXX





http.robots.403.allowXX

XX

http.robots.agentsXX(owr.)
XX

http.store.responsetimeXX

XX

http.time.limitXX





http.timeoutXX

XX

http.tls.certificates.checkXX





http.tls.supported.cipher.suites-X

-X

http.tls.supported.protocols-X

-X

http.useHttp11XX

XX

http.useHttp2XX





http.verbose



XX

index.content.mdXX





index.db.mdXX





index.geoip.licensekeyXX





index.geoip.usageXX





index.geoip.useridXX





index.jexl.filterXX





index.links.hosts.onlyX-





index.links.inlinks.host.ignoreX-





index.links.outlinks.host.ignoreX-





index.metadata



XX

index.metadata.multivalued.fields-X





index.metadata.separatorXX





index.parse.mdXX





index.replace.regexpXX





index.staticXX





index.static.fieldsepXX





index.static.keysepXX





index.static.valuesepXX





indexer.add.domainXX





indexer.additional.params-X





indexer.binary.base64-X





indexer.delete-X





indexer.delete.robots.noindexXX





indexer.delete.skipped.by.indexingfilterXX





indexer.indexwriters.fileXX





indexer.max.content.lengthXX





indexer.max.title.lengthXX

XX

indexer.nocommit-X





indexer.score.powerXX

XX

indexer.skip.notmodifiedXX





indexer.url.filters-XX


X
indexer.url.normalizers-X





indexingfilter.orderXX

XX

injector.current.time-XX
-XX
interactiveselenium.handlersXX





io.file.buffer.size-X





io.serializationsX-

X-

jsoup.extractor.property.file



XX

lang.analyze.max.lengthXX

X-

lang.extraction.policyXX

XX

lang.identification.only.certainXX

XX

lang.index.languagesXX





lang.ngram.max.length



X-

lang.ngram.min.length



X-

libselenium.page.load.delay-X





link.analyze.damping.factorXX





link.analyze.initial.scoreXX





link.analyze.iteration-XX




link.analyze.normalize.score-X

-X

link.analyze.num.iterationsXX





link.analyze.rank.one-XX




link.delete.goneXX





link.ignore.internal.domainXX





link.ignore.internal.hostXX





link.ignore.limit.domainXX





link.ignore.limit.pageXX





link.score.updater.clear.scoreXX





linkdb.ignore.external.linksXX





linkdb.ignore.internal.linksXX





linkdb.max.anchor.lengthXX





linkdb.max.inlinksXX





linkdb.regex-XX




linkdb.url.filters-XX


X
linkdb.url.normalizer-X





linkdb.url.normalizer.scope-X





metatag.description-X





metatag.keyword-X





metatag.keywords-X





metatags.namesXX

XX

mime.type.magicXX

XX

mime.types.fileXX

XX

mimetype.filter.fileXX





moreIndexingFilter.indexMimeTypePartsXX

XX

moreIndexingFilter.mapMimeTypesXX





moreIndexingFilter.mapMimeTypes.fieldXX





nutch.conf.uuid-X

-X

nutch.fetch.time-X





org.apache.nutch.webui



-X

page.load.delayXX





parse.filter.urlsXX(owr.)




parse.job.force



-X

parse.job.resume



-X

parse.normalize.urlsXX(owr.)




parse.plugin.filehttp.proxy.passwordXX

XX

httpparse.proxy.portsitemap



X-X(test)

parsefilter.naivebayes.trainfileXX

(test)







http.proxy.realmparsefilter.naivebayes.wordlistXX

X

X

http.proxy.username







parsefilter.regex.file-XX

X

X







parsefilter.regex.rules-

http.redirect.max

XX





httpparser.robotscaching.403forbidden.allowpolicyXX

XX

httpparser.character.robotsencoding.agentsdefaultXX(test)

XX

(test)



http.timeoutparser.html.form.use_actionXX

XX

httpparser.html.useHttp11implXX

XXhttp

.verboseparser.html.line.separatorsXX





parser.html.outlinks.htmlnode_metadata_nameXX





indexparser.html.contentoutlinks.mdignore_tagsXX

index.db.md



XX

index.parse.mdparser.html.outlinks.max.target.length



XX

(test)



index.replace.regexpparser.skip.truncatedXXindex.static

XX

indexerparser.addstore.domaintextXX





indexerparser.deletetimeoutX

indexer.delete.robots.noindex

Xindexer.max.content.length

XX

indexerpartition.maxurl.title.lengthmodeXX

XX

(test)



partition.url.seed-indexer.score.powerXX
-X

plugin.auto-activationXindexer.skip.notmodifiedX

XX

indexerplugin.url.filtersexcludesXX

X

indexer.url.normalizers

X

indexerplugin.writer.classesfoldersXX

XX

indexingfilterplugin.orderincludesXX

XX

injectorpreferred.currentschema.timename





X
publisher.orderX-





rabbitmq.publisher.bindingXX





langrabbitmq.analyzepublisher.maxbinding.lengthargumentsXX

X







lang.extraction.policyrabbitmq.publisher.exchange.nameXX





rabbitmq.publisher.exchange.optionsXX





langrabbitmq.identificationpublisher.onlyheaders.certainstaticXX





rabbitmq.publisher.queue.nameXX





langrabbitmq.ngrampublisher.maxqueue.lengthoptionsX

lang.ngram.min.length

X





linkrabbitmq.analyzepublisher.damping.factorroutingkeyXX





linkrabbitmq.analyzepublisher.initialserver.scoreuriXX





linkrestapi.analyze.iterationauth



XX

linkrestapi.analyzeauth.normalizessl.scorekeypass



XX

linkrestapi.analyzeauth.numssl.iterationsstorepass



XX

linkrestapi.analyzeauth.rankssl.onestorepath



XX

linkrestapi.deleteauth.goneusers



XX

linkscoring.ignorecontent.internal.domainmdXX





linkscoring.ignoredb.internal.hostmdXX





linkscoring.ignoredepth.limit.domainmaxXX





linkscoring.ignorefilter.limit.pageorderX

X

link.loops.depth-

XX

linkscoring.scoreorphan.updatermark.cleargone.scoreafterXX





linkdb.url.filtersscoring.orphan.mark.orphan.afterXX





X

linkdbscoring.urlparse.normalizermdXX





linkdbscoring.urlsimilarity.normalizer.scopemodelXmetatag.descriptionX

metatag.keywords

X







metatags.namesscoring.similarity.ngramsXX

(test)







mime.type.magicscoring.similarity.stopword.fileXX





screenshot.locationXX





mimesegment.typesdump.filedir-XX





segment.merger.filter-XX




segment.merger.normalizer-moreIndexingFilter.indexMimeTypePartsXX




segment.merger.segmentName-(test)XX

(test)






segment.merger.slice-moreIndexingFilter.mapMimeTypesXX




nutchsegment.confproxy.uuidportX-X





parsesegment.reader.filtercontent.urlsrecodeXX(owr.)parse.




selenium.driverXjob.forceX





parseselenium.jobenable.resumeheadlessXX





parseselenium.firefox.normalizeallowed.urlshostsXX-

(owr.)







parse.plugin.fileselenium.firefox.binary.timeoutX-





selenium.firefox.enable.flash

X

(test)X

X

(test)

-





selenium.firefox.load.imageparser.caching.forbidden.policyX

X

X

X

parser.character.encoding.default

-





selenium.firefox.load.stylesheetX-





selenium.grid.binary

X

XXX





parserselenium.fixgrid.embeddedparamsdriverXX





parserselenium.html.form.use_actionhub.hostXX(test)





selenium.hub.pathXX





(test)

parser.html.impl

Xselenium.hub.portXX

X







parserselenium.html.outlinks.ignore_tagshub.protocolXX





sftp.password



X-X

sftp.port



-parser.skip.truncatedXX

sftp.server



X-X

parsersftp.timeoutuser



-XX

sitemap.content.limit



XX-

partitionsitemap.urlparser.modetimeout



XX

sitemap.redir.maxXX





partitionsitemap.urlsize.seedmaxXX

X







plugin.auto-activationsitemap.strict.parsingXX





sitemap.url.default.sitemap.xmlXX





pluginsitemap.url.excludesfilterXX





sitemap.url.normalizeXXplugin





.folderssitemap.url.overwrite.existingXX





solr.auth



XX

pluginsolr.auth.includespassword



X-X

(test)



solr.auth.username



-XX

(test)



solr.commit.index



Xschema.prefixX

scoringsolr.filtercommit.ordersize



XX

solr.mapping.file



XX

segmentsolr.dumpserver.dirurl



-X

segmentstorage.mergercrawl.filterid



XX

segmentstorage.data.mergerstore.normalizerclass



XX-

segmentstorage.mergerschema.segmentNamehost



XX

segmentstorage.mergerschema.slicewebpage



XX

segmentstore.proxyhttp.portheadersXX





segmentstore.readerhttp.corequestXX





segmentstore.reader.feip.addressXX

XX

segmentsubcollection.readercase.geinsensitiveXX





segmentsubcollection.readerdefault.pafieldnameXX





segmentsubcollection.readermetadata.pdsourceX-X





subcollections.config-segment.reader.ptX

-X

sftpsubcollections.passwordxml-X

sftp.port-X

sftptake.serverscreenshotXsftp.userX





solrtika.authboilerpipe



XX

solrtika.authboilerpipe.passwordextractor



X

solr.auth.username

X

solrtika.commitconfig.indexfileXX





tika.extractorXX





solrtika.extractor.commitboilerpipe.sizealgorithmXX





tika.extractor.boilerpipe.mime.typesXX





solrtika.mappinghtmlmapper.fileclassnameXX

XX

tika.parse.embeddedXsolr.paramsX





solrtika.uppercase.serverelement.urlnamesXX





storageurlfilter.crawl.idautomaton.fileXX

XX

storageurlfilter.data.store.classautomaton.rules-X

-X

(test)



urlfilter.domain.fileXXstorage.schema.host

XX

storageurlfilter.schemadomain.webpagerules-X

-X

subcollectionurlfilter.defaultdomaindenylist.fieldfile-X





subcollectionurlfilter.defaultdomaindenylist.fieldnamerules-X





subcollectionsurlfilter.fast.configfileXX





subcollectionsurlfilter.xmlorderXX

tika.config.fileXX

urlfilter.automatonprefix.fileXX

XX

urlfilter.automatonprefix.rules-X

-X

urlfilter.domainregex.fileXX

XX

urlfilter.domainregex.rules-X

-X

urlfilter.domainblacklistsuffix.fileXX

Xurlfilter.domainblacklist.rulesX

urlfilter.suffix.orderrulesX-XX

-X

urlfilter.prefixtld.filelength



XX

urlmeta.tagsXX





urlfilterurlnormalizer.basic.prefixhost.rulesidnX-





urlnormalizer.basic.host.trim-trailing-dotX-





urlfilterurlnormalizer.regexhosts.fileX-X

X

X







urlfilter.regexurlnormalizer.hosts.rulesX-X





urlfilterurlnormalizer.suffixloop.filecountXX

(test)



XX

(test)



urlfilter.suffix.rulesurlnormalizer.orderXX

urlmeta.tags



XX

urlnormalizer.hostsprotocols.fileXX





urlnormalizer.hostsprotocols.rulesXX





urlnormalizer.loopregex.countfileXX

XX

urlnormalizer.regex.orderrules-X

-X

(test)



urlnormalizer.slashes.file-XX

(test)







urlnormalizer.regexslashes.filerules-X





warc.exporter.only.successful.responses-XX





warc.file.size.max-X





urlnormalizerwebdriver.regexchrome.rulesdriverXX-





webgraph.url.filters-XX


X
webgraph.url.normalizers-X





webgui.auth.users



XX

webtable.dump.content



-X

webtable.dump.headers



-X

webtable.dump.links



-X

webtable.dump.text



-X

webtable.url.regex



-X

back to FrontPage