|
Size: 34115
Comment:
|
← Revision 3 as of 2012-12-12 19:52:50 ⇥
Size: 34115
Comment:
|
| Deletions are marked like this. | Additions are marked like this. |
| Line 79: | Line 79: |
| || fetcher.queue.use.host.settings || || || || || - || X || || || | || fetcher.queue.use.host.settings || || || || || X || X || || || |
List of Nutch Properties
This list of Nutch configuration properties is intended for development. It includes deprecated properties and properties used only "internally". The list is generated from nutch-default.xml and Java sources.
Legend:
- Def.: defined in nutch-default.xml
- Used: read or set from Java code
- Temp.: temporarily used to pass settings (e.g., from command-line arguments) to map or reduce jobs
- Depr.: deprecated
- (owr.): some properties are defined in nutch-default.xml (and may be set in nutch-site.xml) but are overwritten by a command-line argument (tests and benchmarks are excluded)
- (test): overwritten only in tests and benchmarks
|
Trunk |
2.x |
||||||
Property |
Def. |
Used |
Temp. |
Depr. |
Def. |
Used |
Temp. |
Depr. |
CrawlDBScanner.regex |
- |
X |
X |
|
|
|
|
|
CrawlDBScanner.status |
- |
X |
X |
|
|
|
|
|
anchorIndexingFilter.deduplicate |
X |
X |
(test) |
|
X |
X |
(test) |
|
arc.url.version |
- |
X |
|
|
|
|
|
|
content.server.port |
- |
X |
|
|
- |
X |
|
|
crawl.gen.delay |
X |
X |
|
|
X |
X |
|
|
crawldb.url.filters |
- |
X |
X |
|
|
|
X |
|
crawldb.url.normalizers |
- |
X |
|
|
|
|
|
|
crawldb.url.normalizers.scope |
- |
X |
|
|
|
|
|
|
creativecommons.exclude.unlicensed |
- |
X |
|
|
- |
X |
|
|
db.default.fetch.interval |
X |
X |
|
NUTCH-1409 |
X |
X |
|
NUTCH-1409 |
db.fetch.interval.default |
X |
X |
(test) |
|
X |
X |
|
|
db.fetch.interval.max |
X |
X |
|
|
X |
X |
|
|
db.fetch.retry.max |
X |
X |
|
|
X |
X |
|
|
db.fetch.schedule.adaptive.dec_rate |
X |
X |
|
|
X |
X |
|
|
db.fetch.schedule.adaptive.inc_rate |
X |
X |
|
|
X |
X |
|
|
db.fetch.schedule.adaptive.max_interval |
X |
X |
|
|
X |
X |
|
|
db.fetch.schedule.adaptive.min_interval |
X |
X |
|
|
X |
X |
|
|
db.fetch.schedule.adaptive.sync_delta |
X |
X |
|
|
X |
X |
|
|
db.fetch.schedule.adaptive.sync_delta_rate |
X |
X |
|
|
X |
X |
|
|
db.fetch.schedule.class |
X |
X |
(test) |
|
X |
X |
|
|
db.fetch.schedule.mime.file |
X |
X |
|
|
|
|
|
|
db.ignore.external.links |
X |
X |
|
|
X |
X |
|
|
db.ignore.internal.links |
X |
X |
|
|
X |
- |
|
|
db.injector.overwrite |
X |
X |
|
|
|
|
|
|
db.injector.update |
X |
X |
(test) |
|
|
|
|
|
db.max.anchor.length |
X |
X |
|
|
X |
- |
|
|
db.max.fetch.interval |
- |
X |
|
NUTCH-1409 |
- |
X |
|
NUTCH-1409 |
db.max.inlinks |
X |
X |
|
|
X |
- |
|
|
db.max.outlinks.per.page |
X |
X |
|
|
X |
X |
|
|
db.parsemeta.to.crawldb |
X |
X |
|
|
X |
- |
|
|
db.preserve.backup |
X |
X |
|
|
|
|
|
|
db.reader.stats.sort |
- |
X |
X |
|
- |
X |
X |
|
db.reader.topn |
- |
X |
X |
|
|
|
|
|
db.reader.topn.min |
- |
X |
X |
|
|
|
|
|
db.score.count.filtered |
X |
X |
|
|
X |
X |
|
|
db.score.injected |
X |
X |
|
|
X |
X |
|
|
db.score.link.external |
X |
X |
|
|
X |
X |
|
|
db.score.link.internal |
X |
X |
|
|
X |
X |
|
|
db.signature.class |
X |
X |
|
|
X |
X |
|
|
db.signature.text_profile.min_token_len |
X |
X |
|
|
X |
X |
|
|
db.signature.text_profile.quant_rate |
X |
X |
|
|
X |
X |
|
|
db.update.additions.allowed |
X |
X |
|
|
X |
X |
|
|
db.update.max.inlinks |
X |
X |
|
|
X |
X |
|
|
db.update.purge.404 |
X |
X |
|
|
|
|
|
|
dc.language |
- |
X |
|
|
|
|
|
|
domain.statistics.mode |
- |
X |
X |
|
- |
X |
X |
|
elastic.index |
|
|
|
|
X |
- |
|
|
elastic.max.bulk.docs |
|
|
|
|
X |
- |
|
|
elastic.max.bulk.size |
|
|
|
|
X |
- |
|
|
encodingdetector.charset.min.confidence |
X |
- |
|
|
X |
- |
|
|
fail.on.job.failure |
|
|
|
|
- |
X |
|
|
fetcher.exit |
- |
X |
|
|
|
|
|
|
fetcher.follow.outlinks.depth |
X |
X |
|
|
|
|
|
|
fetcher.follow.outlinks.depth.divisor |
X |
X |
|
|
|
|
|
|
fetcher.follow.outlinks.ignore.external |
X |
X |
|
|
|
|
|
|
fetcher.follow.outlinks.num.links |
X |
X |
|
|
|
|
|
|
fetcher.job.resume |
|
|
|
|
- |
X |
|
|
fetcher.max.crawl.delay |
X |
X |
|
|
X |
X |
|
|
fetcher.max.exceptions.per.queue |
X |
X |
|
|
X |
- |
|
|
fetcher.parse |
X |
X |
(test) |
|
X |
X |
|
|
fetcher.queue.depth.multiplier |
X |
X |
|
|
X |
X |
|
|
fetcher.queue.mode |
X |
X |
|
|
X |
X |
|
|
fetcher.queue.use.host.settings |
|
|
|
|
X |
X |
|
|
fetcher.server.delay |
X |
X |
|
|
X |
X |
|
|
fetcher.server.min.delay |
X |
X |
|
|
X |
X |
|
|
fetcher.store.content |
X |
X |
|
|
X |
X |
|
|
fetcher.threads.fetch |
X |
X |
(owr.) |
|
X |
X |
|
|
fetcher.threads.per.host |
|
|
|
NUTCH-1409 |
|
|
|
NUTCH-1409 |
fetcher.threads.per.host.by.ip |
|
|
|
|
- |
X |
|
|
fetcher.threads.per.queue |
X |
X |
|
|
X |
X |
|
|
fetcher.threads.timeout.divisor |
X |
X |
|
|
|
|
|
|
fetcher.throughput.threshold.check.after |
X |
X |
(owr.) |
|
X |
X |
|
|
fetcher.throughput.threshold.pages |
X |
X |
|
|
X |
X |
|
|
fetcher.throughput.threshold.retries |
X |
X |
|
|
|
|
|
|
fetcher.throughput.threshold.sequence |
|
|
|
|
X |
X |
|
|
fetcher.timelimit |
- |
X |
X |
|
- |
X |
X |
|
fetcher.timelimit.mins |
X |
X |
|
|
X |
X |
|
|
fetcher.verbose |
X |
X |
|
|
X |
- |
|
|
file.content.ignored |
X |
- |
|
|
X |
- |
|
|
file.content.limit |
X |
X |
(test) |
|
X |
X |
(test) |
|
file.crawl.parent |
X |
X |
|
|
X |
X |
|
|
free.generator.filter |
- |
X |
|
|
|
|
|
|
free.generator.normalize |
- |
X |
|
|
|
|
|
|
ftp.content.limit |
X |
X |
|
|
X |
X |
|
|
ftp.follow.talk |
X |
X |
|
|
X |
X |
|
|
ftp.keep.connection |
X |
X |
|
|
X |
X |
|
|
ftp.password |
X |
X |
|
|
X |
X |
|
|
ftp.server.timeout |
X |
X |
|
|
X |
X |
|
|
ftp.timeout |
X |
X |
|
|
X |
X |
|
|
ftp.username |
X |
X |
|
|
X |
X |
|
|
generate.batch.id |
|
|
|
|
- |
X |
|
|
generate.count.mode |
X |
X |
|
|
X |
X |
|
|
generate.curTime |
- |
X |
|
|
- |
X |
|
|
generate.filter |
- |
X |
|
|
- |
X |
|
|
generate.max.count |
X |
X |
|
|
X |
X |
|
|
generate.max.distance |
|
|
|
|
X |
X |
|
|
generate.max.num.segments |
- |
X |
|
|
|
|
|
|
generate.max.per.host |
X |
X |
|
NUTCH-1409 |
|
|
|
NUTCH-1409 |
generate.max.per.host.by.ip |
- |
X |
|
NUTCH-1409 |
|
|
|
NUTCH-1409 |
generate.min.interval |
X |
X |
|
|
|
|
|
|
generate.min.score |
X |
X |
|
|
- |
X |
|
|
generate.normalise |
- |
X |
|
|
- |
X |
|
|
generate.partition.seed |
|
|
|
|
- |
X |
|
|
generate.restrict.status |
- |
X |
|
|
|
|
|
|
generate.topN |
- |
X |
|
|
- |
X |
|
|
generate.update.crawldb |
X |
X |
|
|
X |
X |
|
|
hostdb.concurrency.level |
|
|
|
|
- |
X |
|
|
hostdb.lru.size |
|
|
|
|
- |
X |
|
|
htmlparsefilter.order |
X |
X |
|
|
X |
X |
|
|
http.accept |
X |
X |
|
|
X |
X |
|
|
http.accept.language |
X |
X |
|
|
X |
X |
|
|
http.agent.description |
X |
X |
|
|
X |
X |
|
|
http.agent.email |
X |
X |
|
|
X |
X |
|
|
http.agent.host |
X |
X |
|
|
X |
X |
|
|
http.agent.name |
X |
X |
(test) |
|
X |
X |
(test) |
|
http.agent.url |
X |
X |
|
|
X |
X |
|
|
http.agent.version |
X |
X |
|
|
X |
X |
|
|
http.auth.file |
X |
X |
|
|
X |
X |
|
|
http.auth.verbose |
- |
X |
|
|
- |
X |
|
|
http.content.limit |
X |
X |
|
|
X |
X |
|
|
http.max.delays |
X |
- |
|
|
X |
- |
|
|
http.proxy.host |
X |
X |
(test) |
|
X |
X |
(test) |
|
http.proxy.password |
X |
X |
|
|
X |
X |
|
|
http.proxy.port |
X |
X |
(test) |
|
X |
X |
(test) |
|
http.proxy.realm |
X |
X |
|
|
X |
X |
|
|
http.proxy.username |
X |
X |
|
|
X |
X |
|
|
http.redirect.max |
X |
X |
|
|
|
|
|
|
http.robots.403.allow |
X |
X |
|
|
X |
X |
|
|
http.robots.agents |
X |
X |
(test) |
|
X |
X |
(test) |
|
http.timeout |
X |
X |
|
|
X |
X |
|
|
http.useHttp11 |
X |
X |
|
|
X |
X |
|
|
http.verbose |
X |
X |
|
|
X |
X |
|
|
index.content.md |
X |
X |
|
|
|
|
|
|
index.db.md |
X |
X |
|
|
|
|
|
|
index.parse.md |
X |
X |
(test) |
|
|
|
|
|
index.static |
X |
X |
|
|
|
|
|
|
indexer.add.domain |
X |
X |
|
|
|
|
|
|
indexer.delete |
- |
X |
|
|
|
|
|
|
indexer.delete.robots.noindex |
- |
X |
|
|
|
|
|
|
indexer.max.content.length |
X |
X |
|
|
|
|
|
|
indexer.max.title.length |
X |
X |
|
|
X |
X |
(test) |
|
indexer.score.power |
X |
X |
|
|
X |
X |
|
|
indexer.skip.notmodified |
X |
X |
|
|
|
|
|
|
indexer.url.filters |
- |
X |
X |
|
|
|
X |
|
indexer.url.normalizers |
- |
X |
|
|
|
|
|
|
indexer.writer.classes |
- |
X |
X |
|
- |
X |
X |
|
indexingfilter.order |
X |
X |
|
|
X |
X |
|
|
injector.current.time |
- |
X |
X |
|
- |
X |
X |
|
lang.analyze.max.length |
X |
X |
|
|
X |
- |
|
|
lang.extraction.policy |
X |
X |
|
|
X |
X |
|
|
lang.identification.only.certain |
X |
X |
|
|
X |
X |
|
|
lang.ngram.max.length |
|
|
|
|
X |
- |
|
|
lang.ngram.min.length |
|
|
|
|
X |
- |
|
|
link.analyze.damping.factor |
X |
X |
|
|
|
|
|
|
link.analyze.initial.score |
X |
X |
|
|
|
|
|
|
link.analyze.iteration |
- |
X |
X |
|
|
|
|
|
link.analyze.normalize.score |
- |
X |
|
|
- |
X |
|
|
link.analyze.num.iterations |
X |
X |
|
|
|
|
|
|
link.analyze.rank.one |
- |
X |
X |
|
|
|
|
|
link.delete.gone |
X |
X |
|
|
|
|
|
|
link.ignore.internal.domain |
X |
X |
|
|
|
|
|
|
link.ignore.internal.host |
X |
X |
|
|
|
|
|
|
link.ignore.limit.domain |
X |
X |
|
|
|
|
|
|
link.ignore.limit.page |
X |
X |
|
|
|
|
|
|
link.loops.depth |
X |
X |
|
|
|
|
|
|
link.score.updater.clear.score |
X |
X |
|
|
|
|
|
|
linkdb.url.filters |
- |
X |
X |
|
|
|
X |
|
linkdb.url.normalizer |
- |
X |
|
|
|
|
|
|
linkdb.url.normalizer.scope |
- |
X |
|
|
|
|
|
|
metatag.description |
- |
X |
|
|
|
|
|
|
metatag.keywords |
- |
X |
|
|
|
|
|
|
metatags.names |
X |
X |
(test) |
|
|
|
|
|
mime.type.magic |
X |
X |
|
|
X |
X |
|
|
mime.types.file |
X |
X |
|
|
X |
X |
|
|
moreIndexingFilter.indexMimeTypeParts |
X |
X |
(test) |
|
X |
X |
(test) |
|
moreIndexingFilter.mapMimeTypes |
X |
X |
|
|
|
|
|
|
nutch.conf.uuid |
- |
X |
|
|
- |
X |
|
|
parse.filter.urls |
X |
X |
(owr.) |
|
|
|
|
|
parse.job.force |
|
|
|
|
- |
X |
|
|
parse.job.resume |
|
|
|
|
- |
X |
|
|
parse.normalize.urls |
X |
X |
(owr.) |
|
|
|
|
|
parse.plugin.file |
X |
X |
(test) |
|
X |
X |
(test) |
|
parser.caching.forbidden.policy |
X |
X |
|
|
X |
X |
|
|
parser.character.encoding.default |
X |
X |
|
|
X |
X |
|
|
parser.fix.embeddedparams |
X |
- |
|
|
|
|
|
|
parser.html.form.use_action |
X |
X |
(test) |
|
X |
X |
(test) |
|
parser.html.impl |
X |
X |
|
|
X |
X |
|
|
parser.html.outlinks.ignore_tags |
X |
X |
|
|
X |
X |
|
|
parser.skip.truncated |
X |
X |
|
|
X |
X |
|
|
parser.timeout |
X |
X |
|
|
X |
X |
|
|
partition.url.mode |
X |
X |
|
|
X |
X |
|
|
partition.url.seed |
- |
X |
X |
|
- |
X |
|
|
plugin.auto-activation |
X |
X |
|
|
X |
X |
|
|
plugin.excludes |
X |
X |
|
|
X |
X |
|
|
plugin.folders |
X |
X |
|
|
X |
X |
|
|
plugin.includes |
X |
X |
(test) |
|
X |
X |
(test) |
|
schema.prefix |
|
|
|
|
|
|
X |
|
scoring.filter.order |
X |
X |
|
|
X |
X |
|
|
segment.dump.dir |
- |
X |
|
|
|
|
|
|
segment.merger.filter |
- |
X |
X |
|
|
|
|
|
segment.merger.normalizer |
- |
X |
X |
|
|
|
|
|
segment.merger.segmentName |
- |
X |
X |
|
|
|
|
|
segment.merger.slice |
- |
X |
X |
|
|
|
|
|
segment.proxy.port |
- |
X |
|
|
- |
X |
|
|
segment.reader.co |
- |
X |
X |
|
|
|
|
|
segment.reader.fe |
- |
X |
X |
|
|
|
|
|
segment.reader.ge |
- |
X |
X |
|
|
|
|
|
segment.reader.pa |
- |
X |
X |
|
|
|
|
|
segment.reader.pd |
- |
X |
X |
|
|
|
|
|
segment.reader.pt |
- |
X |
X |
|
|
|
|
|
sftp.password |
|
|
|
|
- |
X |
|
|
sftp.port |
|
|
|
|
- |
X |
|
|
sftp.server |
|
|
|
|
- |
X |
|
|
sftp.user |
|
|
|
|
- |
X |
|
|
solr.auth |
X |
X |
|
|
|
|
|
|
solr.auth.password |
- |
X |
|
|
|
|
|
|
solr.auth.username |
- |
X |
|
|
|
|
|
|
solr.commit.index |
X |
X |
|
|
X |
X |
|
|
solr.commit.size |
X |
X |
|
|
X |
X |
|
|
solr.mapping.file |
X |
X |
|
|
X |
X |
|
|
solr.params |
- |
X |
|
|
|
|
|
|
solr.server.url |
- |
X |
|
|
- |
X |
|
|
storage.crawl.id |
|
|
|
|
X |
X |
|
|
storage.data.store.class |
|
|
|
|
X |
X |
(test) |
|
storage.schema.host |
|
|
|
|
X |
X |
|
|
storage.schema.webpage |
|
|
|
|
X |
X |
|
|
subcollection.default.field |
X |
- |
|
|
|
|
|
|
subcollection.default.fieldname |
- |
X |
|
|
|
|
|
|
subcollections.config |
- |
X |
|
|
- |
X |
|
|
subcollections.xml |
- |
X |
|
|
- |
X |
|
|
tika.config.file |
- |
X |
|
|
|
|
|
|
urlfilter.automaton.file |
X |
X |
|
|
X |
X |
|
|
urlfilter.automaton.rules |
- |
X |
|
|
- |
X |
|
|
urlfilter.domain.file |
X |
X |
|
|
X |
X |
|
|
urlfilter.domain.rules |
- |
X |
|
|
- |
X |
|
|
urlfilter.domainblacklist.file |
- |
X |
|
|
|
|
|
|
urlfilter.domainblacklist.rules |
- |
X |
|
|
|
|
|
|
urlfilter.order |
X |
X |
|
|
X |
X |
|
|
urlfilter.prefix.file |
X |
X |
|
|
X |
X |
|
|
urlfilter.prefix.rules |
- |
X |
|
|
- |
X |
|
|
urlfilter.regex.file |
X |
X |
|
|
X |
X |
|
|
urlfilter.regex.rules |
- |
X |
|
|
- |
X |
|
|
urlfilter.suffix.file |
X |
X |
(test) |
|
X |
X |
(test) |
|
urlfilter.suffix.rules |
- |
X |
|
|
- |
X |
|
|
urlmeta.tags |
X |
X |
|
|
|
|
|
|
urlnormalizer.hosts.file |
- |
X |
|
|
|
|
|
|
urlnormalizer.hosts.rules |
- |
X |
|
|
|
|
|
|
urlnormalizer.loop.count |
X |
X |
|
|
X |
X |
|
|
urlnormalizer.order |
X |
X |
(test) |
|
X |
X |
(test) |
|
urlnormalizer.regex.file |
X |
X |
|
|
X |
X |
|
|
urlnormalizer.regex.rules |
- |
X |
|
|
- |
X |
|
|
webgraph.url.filters |
- |
X |
X |
|
|
|
X |
|
webgraph.url.normalizers |
- |
X |
|
|
|
|
|
|
webtable.dump.content |
|
|
|
|
- |
X |
|
|
webtable.dump.headers |
|
|
|
|
- |
X |
|
|
webtable.dump.links |
|
|
|
|
- |
X |
|
|
webtable.dump.text |
|
|
|
|
- |
X |
|
|
webtable.url.regex |
|
|
|
|
- |
X |
|
|
back to FrontPage