List of Nutch Properties

This list of Nutch configuration properties is intended for development use. It includes deprecated properties and properties that are used only internally. The list is generated from nutch-default.xml and the Java sources.

Legend:
Def.: defined in nutch-default.xml

Used: read or set from Java code

Temp.: temporarily used to pass settings (e.g., from command-line arguments) to map or reduce jobs

Depr.: deprecated

(owr.): overwritten. Some properties are defined in nutch-default.xml (and may be set in nutch-site.xml) but are overwritten programmatically, e.g., via a command-line argument in some Nutch tools. Overwrites that occur only in tests and benchmarks are not counted.


1.x (master branch)

2.x (deprecated codebase)

Property

Def.

Used

Temp.

Depr.

Def.

Used

Temp.

Depr.

anchorIndexingFilter.deduplicateXX

XX

any23.content_typesXX





any23.extractorsXX





arc.url.version-X





batch.proxy.port



-X

content.server.port-X

-X

cosine.goldstandard.fileXX





crawl.datum.processor.overdue.time.limit-X





crawl.gen.delayXX

XX

crawldb.inject.filter.normalize.all-X





crawldb.url.filtersXX(owr.)


X
crawldb.url.normalizersXX(owr.)




crawldb.url.normalizers.scope-X





creativecommons.exclude.unlicensed-X

-X

db.fetch.interval.defaultXX

XX

db.fetch.interval.maxXX

XX

db.fetch.retry.maxXX

XX

db.fetch.schedule.adaptive.dec_rateXX

XX

db.fetch.schedule.adaptive.inc_rateXX

XX

db.fetch.schedule.adaptive.max_intervalXX

XX

db.fetch.schedule.adaptive.min_intervalXX

XX

db.fetch.schedule.adaptive.sync_deltaXX

XX

db.fetch.schedule.adaptive.sync_delta_rateX-

X-

db.fetch.schedule.classXX

XX

db.fetch.schedule.mime.fileXX





db.ignore.also.redirectsXX





db.ignore.external.exemptions.fileX-





db.ignore.external.linksXX

XX

db.ignore.external.links.modeXX





db.ignore.internal.linksXX

X-

db.injector.overwriteXX(owr.)




db.injector.updateXX(owr.)




db.max.anchor.length



X-

db.max.outlink.lengthXX





db.max.outlinks.per.pageXX

XX

db.parsemeta.to.crawldbXX

X-

db.preserve.backupXX





db.reader.stats.sort-XX
-XX
db.reader.topn-XX




db.reader.topn.min-XX




db.score.count.filteredXX

XX

db.score.injectedXX

XX

db.score.link.externalXX

XX

db.score.link.internalXX

XX

db.signature.classXX

XX

db.signature.text_profile.min_token_lenX-

X-

db.signature.text_profile.quant_rateX-

X-

db.stats.score.quantilesXX





db.update.additions.allowedXX

XX

db.update.max.inlinksXX

XX

db.update.purge.404XX





db.update.purge.orphansXX





dc.language-X





domain.statistics.mode-XX
-XX
elastic.cluster



X-

elastic.host



X-

elastic.index



X-

elastic.max.bulk.docs



X-

elastic.max.bulk.size



X-

elastic.port



X-

elasticsearch.conf



-X

encodingdetector.charset.min.confidenceXX

XX

exchanges.exchanges.fileXX





fail.on.job.failure



-X

fetcher.bandwidth.targetXX





fetcher.bandwidth.target.check.everyNSecsXX





fetcher.filter.urlsXX





fetcher.follow.outlinks.depthXX





fetcher.follow.outlinks.depth.divisorX-





fetcher.follow.outlinks.ignore.externalX-





fetcher.follow.outlinks.num.linksX-





fetcher.job.resume



-X

fetcher.job.sitemap



-X

fetcher.job.sitemap.detect



-X

fetcher.max.crawl.delayXX

XX

fetcher.max.exceptions.per.queueX-

X-

fetcher.maxNum.threadsXX





fetcher.min.crawl.delayXX





fetcher.normalize.urlsXX





fetcher.parseXX

XX

fetcher.publisherXX





fetcher.queue.depth.multiplierXX

XX

fetcher.queue.modeXX

XX

fetcher.queue.use.host.settings



XX

fetcher.redirect.dedupcache.secondsXX





fetcher.redirect.dedupcache.sizeXX





fetcher.server.delayXX

XX

fetcher.server.min.delayXX

XX

fetcher.signatureXX





fetcher.store.contentXX

XX

fetcher.store.robotstxtXX





fetcher.threads.fetchXX(owr.)
XX

fetcher.threads.per.host.by.ip



-X

fetcher.threads.per.queueXX

XX

fetcher.threads.timeout.divisorXX





fetcher.throughput.threshold.check.afterXX(owr.)
X-

fetcher.throughput.threshold.pagesXX

X-

fetcher.throughput.threshold.retriesXX





fetcher.throughput.threshold.sequence



X-

fetcher.timelimit-XX
-XX
fetcher.timelimit.minsXX

XX

fetcher.verbose



X-

file.content.ignoredX-

X-

file.content.limitXX(owr.)
XX

file.crawl.parentXX

XX

file.crawl.redirect_noncanonicalX-

X-

free.generator.filter-X





free.generator.normalize-X





ftp.content.limitXX

XX

ftp.follow.talkXX

XX

ftp.keep.connectionXX

XX

ftp.passwordXX

XX

ftp.server.timeoutXX

XX

ftp.timeoutXX

XX

ftp.usernameXX

XX

generate.batch.id



-X

generate.count



-X

generate.count.modeXX

XX

generate.curTime-X

-X

generate.expr-X





generate.fetch.delay.exprXX





generate.filter-X

-X

generate.hostdbXX





generate.max.countXX

XX

generate.max.count.exprXX





generate.max.distance



XX

generate.max.num.segments-X





generate.min.intervalXX





generate.min.scoreXX

XX

generate.normalise-X

-X

generate.partition.seed



-X

generate.restrict.statusXX





generate.sitemap



-X

generate.topN-X

-X

generate.update.crawldbXX

XX

gora.buffer.read.limit



X-

gora.buffer.write.limit



X-

hbase.indexer.commit.size



X-

hbase.indexer.mapping.file



X-

hbase.indexer.zookeeper.property.clientPort



X-

hbase.indexer.zookeeper.quorum



X-

headingsX-





headings.multivaluedXX





hostdb.check.failedXX





hostdb.check.knownXX





hostdb.check.newXX





hostdb.concurrency.level



-X

hostdb.crawldatum.processorsXX





hostdb.dump.field.header-X





hostdb.dump.homepages-X





hostdb.dump.hostnames-X





hostdb.filter.expression-X





hostdb.force.checkXX





hostdb.lru.size



-X

hostdb.num.resolvers.threadsXX





hostdb.numeric.fieldsXX





hostdb.percentilesXX





hostdb.purge.failed.hosts.thresholdXX





hostdb.reading.crawldb-XX




hostdb.recheck.intervalXX





hostdb.string.fieldsXX





hostdb.url.filterXX





hostdb.url.normalizeXX





htmlparsefilter.orderXX

XX

htmlunit.enable.cssXX





htmlunit.enable.javascriptXX





htmlunit.javascript.timeoutXX





http.acceptXX

XX

http.accept.charsetXX

XX

http.accept.languageXX

XX

http.agent.descriptionXX

XX

http.agent.emailXX

XX

http.agent.hostXX

XX

http.agent.host.cookie.fileXX





http.agent.nameXX(owr.)
XX

http.agent.rotateXX

XX

http.agent.rotate.fileXX

XX

http.agent.urlXX

XX

http.agent.versionXX

XX

http.auth.fileXX

XX

http.auth.verbose-X

-X

http.content.limitXX(owr.)
XX

http.content.truncated-X





http.content.truncated.reason-X





http.enable.cookie.headerXX





http.enable.if.modified.since.headerXX





http.log.exceptions.suppress.stackXX





http.max.delays



X-

http.partial.truncatedXX





http.proxy.exception.listXX





http.proxy.hostXX

XX

http.proxy.passwordXX

XX

http.proxy.portXX

XX

http.proxy.realmXX

XX

http.proxy.typeXX





http.proxy.usernameXX

XX

http.redirect.maxXX





http.redirect.max.exceeded.skipXX





http.robot.rules.whitelistXX





http.robots.403.allowXX

XX

http.robots.agentsXX(owr.)
XX

http.store.responsetimeXX

XX

http.time.limitXX





http.timeoutXX

XX

http.tls.certificates.checkXX





http.tls.supported.cipher.suites-X

-X

http.tls.supported.protocols-X

-X

http.useHttp11XX

XX

http.useHttp2XX





http.verbose



XX

index.content.mdXX





index.db.mdXX





index.geoip.licensekeyXX





index.geoip.usageXX





index.geoip.useridXX





index.jexl.filterXX





index.links.hosts.onlyX-





index.links.inlinks.host.ignoreX-





index.links.outlinks.host.ignoreX-





index.metadata



XX

index.metadata.multivalued.fields-X





index.metadata.separatorXX





index.parse.mdXX





index.replace.regexpXX





index.staticXX





index.static.fieldsepXX





index.static.keysepXX





index.static.valuesepXX





indexer.add.domainXX





indexer.additional.params-X





indexer.binary.base64-X





indexer.delete-X





indexer.delete.robots.noindexXX





indexer.delete.skipped.by.indexingfilterXX





indexer.indexwriters.fileXX





indexer.max.content.lengthXX





indexer.max.title.lengthXX

XX

indexer.nocommit-X





indexer.score.powerXX

XX

indexer.skip.notmodifiedXX





indexer.url.filters-XX


X
indexer.url.normalizers-X





indexingfilter.orderXX

XX

injector.current.time-XX
-XX
interactiveselenium.handlersXX





io.file.buffer.size-X





io.serializationsX-

X-

jsoup.extractor.property.file



XX

lang.analyze.max.lengthXX

X-

lang.extraction.policyXX

XX

lang.identification.only.certainXX

XX

lang.index.languagesXX





lang.ngram.max.length



X-

lang.ngram.min.length



X-

libselenium.page.load.delay-X





link.analyze.damping.factorXX





link.analyze.initial.scoreXX





link.analyze.iteration-XX




link.analyze.normalize.score-X

-X

link.analyze.num.iterationsXX





link.analyze.rank.one-XX




link.delete.goneXX





link.ignore.internal.domainXX





link.ignore.internal.hostXX





link.ignore.limit.domainXX





link.ignore.limit.pageXX





link.score.updater.clear.scoreXX





linkdb.ignore.external.linksXX





linkdb.ignore.internal.linksXX





linkdb.max.anchor.lengthXX





linkdb.max.inlinksXX





linkdb.regex-XX




linkdb.url.filters-XX


X
linkdb.url.normalizer-X





linkdb.url.normalizer.scope-X





metatag.description-X





metatag.keyword-X





metatag.keywords-X





metatags.namesXX

XX

mime.type.magicXX

XX

mime.types.fileXX

XX

mimetype.filter.fileXX





moreIndexingFilter.indexMimeTypePartsXX

XX

moreIndexingFilter.mapMimeTypesXX





moreIndexingFilter.mapMimeTypes.fieldXX





nutch.conf.uuid-X

-X

nutch.fetch.time-X





org.apache.nutch.webui



-X

page.load.delayXX





parse.filter.urlsXX(owr.)




parse.job.force



-X

parse.job.resume



-X

parse.normalize.urlsXX(owr.)




parse.plugin.fileXX

XX

parse.sitemap



-X

parsefilter.naivebayes.trainfileXX





parsefilter.naivebayes.wordlistXX





parsefilter.regex.file-X





parsefilter.regex.rules-X





parser.caching.forbidden.policyXX

XX

parser.character.encoding.defaultXX

XX

parser.html.form.use_actionXX

XX

parser.html.implXX

XX

parser.html.line.separatorsXX





parser.html.outlinks.htmlnode_metadata_nameXX





parser.html.outlinks.ignore_tagsXX

XX

parser.html.outlinks.max.target.length



XX

parser.skip.truncatedXX

XX

parser.store.textXX





parser.timeoutXX

XX

partition.url.modeXX

XX

partition.url.seed-XX
-X

plugin.auto-activationXX

XX

plugin.excludesXX

XX

plugin.foldersXX

XX

plugin.includesXX

XX

preferred.schema.name





X
publisher.orderX-





rabbitmq.publisher.bindingXX





rabbitmq.publisher.binding.argumentsXX





rabbitmq.publisher.exchange.nameXX





rabbitmq.publisher.exchange.optionsXX





rabbitmq.publisher.headers.staticXX





rabbitmq.publisher.queue.nameXX





rabbitmq.publisher.queue.optionsXX





rabbitmq.publisher.routingkeyXX





rabbitmq.publisher.server.uriXX





restapi.auth



XX

restapi.auth.ssl.keypass



XX

restapi.auth.ssl.storepass



XX

restapi.auth.ssl.storepath



XX

restapi.auth.users



XX

scoring.content.mdXX





scoring.db.mdXX





scoring.depth.maxXX





scoring.filter.orderX-

XX

scoring.orphan.mark.gone.afterXX





scoring.orphan.mark.orphan.afterXX





scoring.parse.mdXX





scoring.similarity.modelXX





scoring.similarity.ngramsXX





scoring.similarity.stopword.fileXX





screenshot.locationXX





segment.dump.dir-X





segment.merger.filter-XX




segment.merger.normalizer-XX




segment.merger.segmentName-XX




segment.merger.slice-XX




segment.proxy.port-X





segment.reader.content.recodeXX(owr.)




selenium.driverXX





selenium.enable.headlessXX





selenium.firefox.allowed.hostsX-





selenium.firefox.binary.timeoutX-





selenium.firefox.enable.flashX-





selenium.firefox.load.imageX-





selenium.firefox.load.stylesheetX-





selenium.grid.binaryXX





selenium.grid.driverXX





selenium.hub.hostXX





selenium.hub.pathXX





selenium.hub.portXX





selenium.hub.protocolXX





sftp.password



-X

sftp.port



-X

sftp.server



-X

sftp.user



-X

sitemap.content.limit



X-

sitemap.parser.timeout



XX

sitemap.redir.maxXX





sitemap.size.maxXX





sitemap.strict.parsingXX





sitemap.url.default.sitemap.xmlXX





sitemap.url.filterXX





sitemap.url.normalizeXX





sitemap.url.overwrite.existingXX





solr.auth



XX

solr.auth.password



-X

solr.auth.username



-X

solr.commit.index



XX

solr.commit.size



XX

solr.mapping.file



XX

solr.server.url



-X

storage.crawl.id



XX

storage.data.store.class



X-

storage.schema.host



XX

storage.schema.webpage



XX

store.http.headersXX





store.http.requestXX





store.ip.addressXX

XX

subcollection.case.insensitiveXX





subcollection.default.fieldnameXX





subcollection.metadata.source-X





subcollections.config-X

-X

subcollections.xml-X

-X

take.screenshotXX





tika.boilerpipe



XX

tika.boilerpipe.extractor



XX

tika.config.fileXX





tika.extractorXX





tika.extractor.boilerpipe.algorithmXX





tika.extractor.boilerpipe.mime.typesXX





tika.htmlmapper.classnameXX

XX

tika.parse.embeddedXX





tika.uppercase.element.namesXX





urlfilter.automaton.fileXX

XX

urlfilter.automaton.rules-X

-X

urlfilter.domain.fileXX

XX

urlfilter.domain.rules-X

-X

urlfilter.domaindenylist.file-X





urlfilter.domaindenylist.rules-X





urlfilter.fast.fileXX





urlfilter.orderXX

XX

urlfilter.prefix.fileXX

XX

urlfilter.prefix.rules-X

-X

urlfilter.regex.fileXX

XX

urlfilter.regex.rules-X

-X

urlfilter.suffix.fileXX

XX

urlfilter.suffix.rules-X

-X

urlfilter.tld.length



XX

urlmeta.tagsXX





urlnormalizer.basic.host.idnX-





urlnormalizer.basic.host.trim-trailing-dotX-





urlnormalizer.hosts.file-X





urlnormalizer.hosts.rules-X





urlnormalizer.loop.countXX

XX

urlnormalizer.orderXX

XX

urlnormalizer.protocols.fileXX





urlnormalizer.protocols.rulesXX





urlnormalizer.regex.fileXX

XX

urlnormalizer.regex.rules-X

-X

urlnormalizer.slashes.file-X





urlnormalizer.slashes.rules-X





warc.exporter.only.successful.responses-X





warc.file.size.max-X





webdriver.chrome.driverX-





webgraph.url.filters-XX


X
webgraph.url.normalizers-X





webgui.auth.users



XX

webtable.dump.content



-X

webtable.dump.headers



-X

webtable.dump.links



-X

webtable.dump.text



-X

webtable.url.regex



-X

back to FrontPage

  • No labels