You are viewing an old version of this page. View the current version.

Compare with Current View Page History

« Previous Version 55 Current »

List of Nutch Properties

This list of Nutch configuration properties is intended for development. It includes deprecated properties and properties used only "internally". The list is generated from nutch-default.xml and Java sources.

Legend:
Def.: defined in nutch-default.xml

Used: read or set from Java code

Temp.: temporarily used to pass settings (e.g., from command-line arguments) to map or reduce jobs

Depr.: deprecated

(owr.): some properties are defined in nutch-default.xml (and may be set in nutch-site.xml) but are overwritten by a command-line argument (tests and benchmarks are excluded)

(test): overwritten only in tests and benchmarks


1.x (master branch)

2.x (deprecated codebase)

Property

Def.

Used

Temp.

Depr.

Def.

Used

Temp.

Depr.

CrawlDBScanner.regex


X

X






CrawlDBScanner.status


X

X






anchorIndexingFilter.deduplicate

X

X

(test)


X

X

(test)


arc.url.version


X







content.server.port


X




X



crawl.gen.delay

X

X



X

X



crawldb.url.filters


X

X




X


crawldb.url.normalizers


X







crawldb.url.normalizers.scope


X







creativecommons.exclude.unlicensed


X




X



db.default.fetch.interval

X

X


NUTCH-1409

X

X


NUTCH-1409

db.fetch.interval.default

X

X

(test)


X

X



db.fetch.interval.max

X

X



X

X



db.fetch.retry.max

X

X



X

X



db.fetch.schedule.adaptive.dec_rate

X

X



X

X



db.fetch.schedule.adaptive.inc_rate

X

X



X

X



db.fetch.schedule.adaptive.max_interval

X

X



X

X



db.fetch.schedule.adaptive.min_interval

X

X



X

X



db.fetch.schedule.adaptive.sync_delta

X

X



X

X



db.fetch.schedule.adaptive.sync_delta_rate

X

X



X

X



db.fetch.schedule.class

X

X

(test)


X

X



db.fetch.schedule.mime.file

X

X







db.ignore.external.links

X

X



X

X



db.ignore.internal.links

X

X



X




db.injector.overwrite

X

X







db.injector.update

X

X

(test)






db.max.anchor.length

X

X



X




db.max.fetch.interval


X


NUTCH-1409


X


NUTCH-1409

db.max.inlinks

X

X



X




db.max.outlinks.per.page

X

X



X

X



db.parsemeta.to.crawldb

X

X



X




db.preserve.backup

X

X







db.reader.stats.sort


X

X



X

X


db.reader.topn


X

X






db.reader.topn.min


X

X






db.score.count.filtered

X

X



X

X



db.score.injected

X

X



X

X



db.score.link.external

X

X



X

X



db.score.link.internal

X

X



X

X



db.signature.class

X

X



X

X



db.signature.text_profile.min_token_len

X

X



X

X



db.signature.text_profile.quant_rate

X

X



X

X



db.update.additions.allowed

X

X



X

X



db.update.max.inlinks

X

X



X

X



db.update.purge.404

X

X







dc.language


X







domain.statistics.mode


X

X



X

X


elastic.index





X




elastic.max.bulk.docs





X




elastic.max.bulk.size





X




encodingdetector.charset.min.confidence

X




X




fail.on.job.failure






X



fetcher.exit


X







fetcher.follow.outlinks.depth

X

X







fetcher.follow.outlinks.depth.divisor

X

X







fetcher.follow.outlinks.ignore.external

X

X







fetcher.follow.outlinks.num.links

X

X







fetcher.job.resume






X



fetcher.max.crawl.delay

X

X



X

X



fetcher.max.exceptions.per.queue

X

X



X




fetcher.parse

X

X

(test)


X

X



fetcher.queue.depth.multiplier

X

X



X

X



fetcher.queue.mode

X

X



X

X



fetcher.queue.use.host.settings





X

X



fetcher.server.delay

X

X



X

X



fetcher.server.min.delay

X

X



X

X



fetcher.store.content

X

X



X

X



fetcher.threads.fetch

X

X

(owr.)


X

X



fetcher.threads.per.host




NUTCH-1409




NUTCH-1409

fetcher.threads.per.host.by.ip






X



fetcher.threads.per.queue

X

X



X

X



fetcher.threads.timeout.divisor

X

X







fetcher.throughput.threshold.check.after

X

X

(owr.)


X

X



fetcher.throughput.threshold.pages

X

X



X

X



fetcher.throughput.threshold.retries

X

X







fetcher.throughput.threshold.sequence





X

X



fetcher.timelimit


X

X



X

X


fetcher.timelimit.mins

X

X



X

X



fetcher.verbose

X

X



X




file.content.ignored

X




X




file.content.limit

X

X

(test)


X

X

(test)


file.crawl.parent

X

X



X

X



free.generator.filter


X







free.generator.normalize


X







ftp.content.limit

X

X



X

X



ftp.follow.talk

X

X



X

X



ftp.keep.connection

X

X



X

X



ftp.password

X

X



X

X



ftp.server.timeout

X

X



X

X



ftp.timeout

X

X



X

X



ftp.username

X

X



X

X



generate.batch.id






X



generate.count.mode

X

X



X

X



generate.curTime


X




X



generate.filter


X




X



generate.max.count

X

X



X

X



generate.max.distance





X

X



generate.max.num.segments


X







generate.max.per.host

X

X


NUTCH-1409




NUTCH-1409

generate.max.per.host.by.ip


X


NUTCH-1409




NUTCH-1409

generate.min.interval

X

X







generate.min.score

X

X




X



generate.normalise


X




X



generate.partition.seed






X



generate.restrict.status


X







generate.topN


X




X



generate.update.crawldb

X

X



X

X



hostdb.concurrency.level






X



hostdb.lru.size






X



htmlparsefilter.order

X

X



X

X



http.accept

X

X



X

X



http.accept.language

X

X



X

X



http.agent.description

X

X



X

X



http.agent.email

X

X



X

X



http.agent.host

X

X



X

X



http.agent.name

X

X

(test)


X

X

(test)


http.agent.url

X

X



X

X



http.agent.version

X

X



X

X



http.auth.file

X

X



X

X



http.auth.verbose


X




X



http.content.limit

X

X



X

X



http.max.delays

X




X




http.proxy.host

X

X

(test)


X

X

(test)


http.proxy.password

X

X



X

X



http.proxy.port

X

X

(test)


X

X

(test)


http.proxy.realm

X

X



X

X



http.proxy.username

X

X



X

X



http.redirect.max

X

X







http.robots.403.allow

X

X



X

X



http.robots.agents

X

X

(test)


X

X

(test)


http.timeout

X

X



X

X



http.useHttp11

X

X



X

X



http.verbose

X

X



X

X



index.content.md

X

X







index.db.md

X

X







index.parse.md

X

X

(test)






index.replace.regexp

X

X







index.static

X

X







indexer.add.domain

X

X







indexer.delete


X







indexer.delete.robots.noindex


X







indexer.max.content.length

X

X







indexer.max.title.length

X

X



X

X

(test)


indexer.score.power

X

X



X

X



indexer.skip.notmodified

X

X







indexer.url.filters


X

X




X


indexer.url.normalizers


X







indexer.writer.classes


X

X



X

X


indexingfilter.order

X

X



X

X



injector.current.time


X

X



X

X


lang.analyze.max.length

X

X



X




lang.extraction.policy

X

X



X

X



lang.identification.only.certain

X

X



X

X



lang.ngram.max.length





X




lang.ngram.min.length





X




link.analyze.damping.factor

X

X







link.analyze.initial.score

X

X







link.analyze.iteration


X

X






link.analyze.normalize.score


X




X



link.analyze.num.iterations

X

X







link.analyze.rank.one


X

X






link.delete.gone

X

X







link.ignore.internal.domain

X

X







link.ignore.internal.host

X

X







link.ignore.limit.domain

X

X







link.ignore.limit.page

X

X







link.loops.depth

X

X







link.score.updater.clear.score

X

X







linkdb.url.filters


X

X




X


linkdb.url.normalizer


X







linkdb.url.normalizer.scope


X







metatag.description


X







metatag.keywords


X







metatags.names

X

X

(test)






mime.type.magic

X

X



X

X



mime.types.file

X

X



X

X



moreIndexingFilter.indexMimeTypeParts

X

X

(test)


X

X

(test)


moreIndexingFilter.mapMimeTypes

X

X







nutch.conf.uuid


X




X



parse.filter.urls

X

X

(owr.)






parse.job.force






X



parse.job.resume






X



parse.normalize.urls

X

X

(owr.)






parse.plugin.file

X

X

(test)


X

X

(test)


parser.caching.forbidden.policy

X

X



X

X



parser.character.encoding.default

X

X



X

X



parser.fix.embeddedparams

X








parser.html.form.use_action

X

X

(test)


X

X

(test)


parser.html.impl

X

X



X

X



parser.html.outlinks.ignore_tags

X

X



X

X



parser.skip.truncated

X

X



X

X



parser.timeout

X

X



X

X



partition.url.mode

X

X



X

X



partition.url.seed


X

X



X



plugin.auto-activation

X

X



X

X



plugin.excludes

X

X



X

X



plugin.folders

X

X



X

X



plugin.includes

X

X

(test)


X

X

(test)


schema.prefix







X


scoring.filter.order

X

X



X

X



segment.dump.dir


X







segment.merger.filter


X

X






segment.merger.normalizer


X

X






segment.merger.segmentName


X

X






segment.merger.slice


X

X






segment.proxy.port


X




X



segment.reader.co


X

X






segment.reader.fe


X

X






segment.reader.ge


X

X






segment.reader.pa


X

X






segment.reader.pd


X

X






segment.reader.pt


X

X






sftp.password






X



sftp.port






X



sftp.server






X



sftp.user






X



solr.auth

X

X







solr.auth.password


X







solr.auth.username


X







solr.commit.index

X

X



X

X



solr.commit.size

X

X



X

X



solr.mapping.file

X

X



X

X



solr.params


X







solr.server.url


X




X



storage.crawl.id





X

X



storage.data.store.class





X

X

(test)


storage.schema.host





X

X



storage.schema.webpage





X

X



subcollection.default.field

X








subcollection.default.fieldname


X







subcollections.config


X




X



subcollections.xml


X




X



tika.config.file


X







urlfilter.automaton.file

X

X



X

X



urlfilter.automaton.rules


X




X



urlfilter.domain.file

X

X



X

X



urlfilter.domain.rules


X




X



urlfilter.domainblacklist.file


X







urlfilter.domainblacklist.rules


X







urlfilter.order

X

X



X

X



urlfilter.prefix.file

X

X



X

X



urlfilter.prefix.rules


X




X



urlfilter.regex.file

X

X



X

X



urlfilter.regex.rules


X




X



urlfilter.suffix.file

X

X

(test)


X

X

(test)


urlfilter.suffix.rules


X




X



urlmeta.tags

X

X







urlnormalizer.hosts.file


X







urlnormalizer.hosts.rules


X







urlnormalizer.loop.count

X

X



X

X



urlnormalizer.order

X

X

(test)


X

X

(test)


urlnormalizer.regex.file

X

X



X

X



urlnormalizer.regex.rules


X




X



webgraph.url.filters


X

X




X


webgraph.url.normalizers


X







webtable.dump.content






X



webtable.dump.headers






X



webtable.dump.links






X



webtable.dump.text






X



webtable.url.regex






X



back to FrontPage

  • No labels