Skip to content

Advanced usage

SiteOne Crawler can be used in a wide variety of ways, which are described in the list of features and in the examples.

You can learn how to use individual functions in the corresponding feature sections, and documentation is also available for dozens of configuration parameters.

# Example invocation of ./crawler that combines many command-line options in a
# single call. Replace the placeholder values (URLs, domains, credentials,
# paths) with your own before running it.
#
# Quoting note: the second --analyzer-filter-regex value is single-quoted
# because it contains "(?!"; inside double quotes an interactive bash session
# would treat the "!" as history expansion and refuse to run the command.
./crawler \
--url='https://crawler.siteone.io/?a=b&c=d' \
--device="desktop" \
--user-agent="CustomUserAgent/1.0" \
--timeout=10 \
--workers=5 \
--max-reqs-per-sec=15 \
--memory-limit=4096M \
--proxy="192.168.1.100:8080" \
--http-auth="user:secret123" \
--allowed-domain-for-external-files="*.example.com" \
--allowed-domain-for-external-files="fonts.google.com" \
--allowed-domain-for-crawling="*.example.org" \
--allowed-domain-for-crawling="*.example.*" \
--allowed-domain-for-crawling="*.other.com" \
--output="text" \
--extra-columns="DOM,X-Cache(10),Title(40>)" \
--url-column-size=100 \
--show-inline-criticals \
--show-inline-warnings \
--do-not-truncate-url \
--show-scheme-and-host \
--hide-progress-bar \
--no-color \
--disable-javascript \
--disable-styles \
--disable-fonts \
--disable-images \
--disable-files \
--remove-all-anchor-listeners \
--include-regex="/.*\/product\/.*/" \
--include-regex="/.*\/category\/.*/" \
--include-regex="/.*\/faq\/.*/" \
--ignore-regex="/.*\/secret\/.*/" \
--regex-filtering-only-for-pages \
--analyzer-filter-regex="/(Content|Accessibility)/i" \
--analyzer-filter-regex='/^(?:(?!Best|Access).)*$/i' \
--accept-encoding="gzip, deflate" \
--remove-query-params \
--add-random-query-params \
--max-queue-length=10000 \
--max-visited-urls=15000 \
--max-url-length=2500 \
--result-storage="file" \
--result-storage-dir="results" \
--result-storage-compression \
--http-cache-dir="cache" \
--http-cache-compression \
--websocket-server="0.0.0.0:8000" \
--output-html-report="tmp/report-mypage.html" \
--output-json-file="tmp/report-mypage.json" \
--output-text-file="tmp/report-mypage.txt" \
--add-host-to-output-file \
--add-timestamp-to-output-file \
--mail-to="your@email.com" \
--mail-from="crawler@email.com" \
--mail-from-name="SiteOne Crawler" \
--mail-subject-template="Crawler Report for %domain% (%date%)" \
--mail-smtp-host="smtp.example.com" \
--mail-smtp-port=25 \
--mail-smtp-user="smtpuser" \
--mail-smtp-pass="smtppass" \
--offline-export-dir="tmp/example.com/" \
--offline-export-store-only-url-regex=".*offline.*" \
--sitemap-xml-file="tmp/sitemap.xml" \
--sitemap-txt-file="tmp/sitemap.txt" \
--sitemap-base-priority=0.6 \
--sitemap-priority-increase=0.2 \
--fastest-urls-top-limit=25 \
--fastest-urls-max-time=2 \
--max-heading-level=4 \
--slowest-urls-top-limit=25 \
--slowest-urls-min-time=0.02 \
--slowest-urls-max-time=5 \
--debug \
--debug-log-file="debug.log" \
--debug-url-regex="/.*contact.*/"