2
# Catalogue that drives the generated demo pages.
# Each entry carries: sku, name, category, price, rating, stock,
# features (list of short strings) and related (SKUs of other entries
# in this same list).
PRODUCTS = [
    {
        "sku": "CRW-101",
        "name": "Crawler Reliability Kit",
        "category": "automation",
        "price": 149.0,
        "rating": 4.8,
        "stock": 18,
        "features": ["retry policy", "queue replay", "structured logs"],
        "related": ["CRW-202", "CRW-303"],
    },
    {
        "sku": "CRW-202",
        "name": "Playwright Rendering Pack",
        "category": "browser",
        "price": 249.0,
        "rating": 4.7,
        "stock": 9,
        "features": ["headless chromium", "screenshots", "dynamic DOM extraction"],
        "related": ["CRW-101", "CRW-404"],
    },
    {
        "sku": "CRW-303",
        "name": "RAG Extraction Bundle",
        "category": "ai-data",
        "price": 199.0,
        "rating": 4.9,
        "stock": 13,
        "features": ["clean text chunks", "metadata capture", "JSONL export"],
        "related": ["CRW-101", "CRW-505"],
    },
    {
        "sku": "CRW-404",
        "name": "Anti-Fragile Session Toolkit",
        "category": "resilience",
        "price": 299.0,
        "rating": 4.6,
        "stock": 5,
        "features": ["session rotation", "state recovery", "graceful failures"],
        "related": ["CRW-202", "CRW-505"],
    },
    {
        "sku": "CRW-505",
        "name": "Data Export Control Plane",
        "category": "storage",
        "price": 179.0,
        "rating": 4.5,
        "stock": 21,
        "features": ["datasets", "key-value store", "CSV and JSON export"],
        "related": ["CRW-303", "CRW-404"],
    },
]
def layout(title, body, extra_head="", extra_script=""):
    """Wrap *body* in the shared HTML page shell and return the full document.

    Args:
        title: Text used for the ``<title>``, the ``<h1>`` in the header and
            the start of the meta description. Inserted as-is (not escaped).
        body: HTML markup placed inside ``<main>``. Inserted as-is.
        extra_head: Optional markup appended at the end of ``<head>``.
        extra_script: Optional markup placed after the footer, just before
            ``</body>``.

    Returns:
        The complete HTML document as a string.
    """
    # NOTE(review): every nav link is an absolute URL on www.marktechpost.com
    # even though the footer describes a local demo site — confirm whether
    # site-relative links were intended.
    # The template text below is kept flush-left so the emitted markup matches
    # the source text line for line.
    css = """
<style>
body {
font-family: Inter, system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif;
margin: 0;
background: #f7f7fb;
color: #1f2430;
}
header {
background: #202638;
color: white;
padding: 28px 40px;
}
nav a {
color: #dbe7ff;
margin-right: 18px;
text-decoration: none;
font-weight: 600;
}
main {
max-width: 1050px;
margin: 0 auto;
padding: 32px;
}
.grid {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(230px, 1fr));
gap: 18px;
}
.card, article, .panel {
background: white;
border: 1px solid #e5e7ef;
border-radius: 16px;
padding: 20px;
box-shadow: 0 8px 25px rgba(20, 30, 60, 0.05);
}
.price {
font-size: 1.3rem;
font-weight: 800;
}
.tag {
display: inline-block;
background: #edf2ff;
border: 1px solid #d6e0ff;
border-radius: 999px;
padding: 4px 10px;
margin: 3px;
font-size: 0.82rem;
}
.stock-low {
color: #b42318;
font-weight: 700;
}
.stock-ok {
color: #067647;
font-weight: 700;
}
code, pre {
background: #111827;
color: #d1fae5;
border-radius: 10px;
}
pre {
padding: 16px;
overflow-x: auto;
}
footer {
padding: 30px 40px;
color: #606779;
}
</style>
"""
    return f"""
<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<meta name="description" content="{title} page for a Crawlee Python tutorial demo website.">
<title>{title}</title>
{css}
{extra_head}
</head>
<body>
<header>
<h1>{title}</h1>
<nav>
<a href="https://www.marktechpost.com/index.html">Home</a>
<a href="https://www.marktechpost.com/products/product-crw-101.html">Products</a>
<a href="https://www.marktechpost.com/docs/getting-started.html">Docs</a>
<a href="https://www.marktechpost.com/blog/crawling-at-scale.html">Blog</a>
<a href="https://www.marktechpost.com/dynamic.html">Dynamic JS Page</a>
<a href="https://www.marktechpost.com/admin/hidden.html">Admin</a>
</nav>
</header>
<main>{body}</main>
<footer>Local demo website generated for Crawlee Python advanced tutorial.</footer>
{extra_script}
</body>
</html>
"""
def build_demo_site():
    """Write robots.txt, the index page and one detail page per product.

    Output paths are built under ``SITE_DIR``:

    * ``robots.txt`` — disallows ``/admin/`` and allows everything else.
    * ``index.html`` — an intro panel, a teaser card for every entry in
      ``PRODUCTS`` and a list of links to other pages.
    * ``products/product-<slug>.html`` — one page per product with its
      attributes, feature list, links to related SKUs and a JSON-LD
      ``Product`` block.

    ``write_file``, ``SITE_DIR`` and ``safe_slug`` are defined elsewhere;
    presumably ``write_file(path, text)`` writes *text* to *path* and
    ``safe_slug`` turns a SKU into a file-name-safe slug — verify against
    their definitions.
    """
    # NOTE(review): all links are absolute URLs on www.marktechpost.com even
    # though the page copy describes a local website — confirm whether
    # site-relative links were intended.
    # Template text is kept flush-left so the emitted markup matches the
    # source text line for line.
    write_file(
        SITE_DIR / "robots.txt",
        """
User-agent: *
Disallow: /admin/
Allow: /
""",
    )

    # One teaser card per product; stock below 10 gets the "stock-low" class.
    product_cards = [
        f"""
<div class="card product-teaser" data-sku="{product['sku']}" data-category="{product['category']}">
<h2><a href="https://www.marktechpost.com/products/product-{safe_slug(product['sku'])}.html">{product['name']}</a></h2>
<p>{product['category']} crawler module with rating {product['rating']}.</p>
<p class="price" data-price="{product['price']}">${product['price']:.2f}</p>
<p class="{'stock-low' if product['stock'] < 10 else 'stock-ok'}">Stock: {product['stock']}</p>
</div>
"""
        for product in PRODUCTS
    ]

    write_file(
        SITE_DIR / "index.html",
        layout(
            "Crawlee Demo Commerce + Docs Hub",
            f"""
<section class="panel">
<h2>Why this site exists</h2>
<p>
This local website gives us predictable pages for testing Crawlee without scraping a third-party website.
We include static HTML pages, documentation pages, product detail pages, a blog article, robots.txt,
and a JavaScript-rendered page.
</p>
</section>
<h2>Featured crawler modules</h2>
<section class="grid">
{''.join(product_cards)}
</section>
<section class="panel">
<h2>Internal links for recursive crawling</h2>
<ul>
<li><a href="https://www.marktechpost.com/docs/getting-started.html">Getting started guide</a></li>
<li><a href="https://www.marktechpost.com/docs/advanced-routing.html">Advanced routing guide</a></li>
<li><a href="https://www.marktechpost.com/blog/crawling-at-scale.html">Crawling at scale article</a></li>
<li><a href="https://www.marktechpost.com/dynamic.html">JavaScript-rendered catalog</a></li>
<li><a href="https://www.marktechpost.com/admin/hidden.html">Admin page blocked by robots and crawler filters</a></li>
</ul>
</section>
""",
        ),
    )

    for product in PRODUCTS:
        related_links = "\n".join(
            f'<li><a class="related-link" href="https://www.marktechpost.com/products/product-{safe_slug(sku)}.html">{sku}</a></li>'
            for sku in product["related"]
        )
        feature_list = "\n".join(f"<li>{feature}</li>" for feature in product["features"])
        # Structured data embedded in the page as a JSON-LD <script> block.
        json_ld = json.dumps(
            {
                "@context": "https://schema.org",
                "@type": "Product",
                "sku": product["sku"],
                "name": product["name"],
                "category": product["category"],
                "offers": {
                    "@type": "Offer",
                    "price": product["price"],
                    "priceCurrency": "USD",
                },
                "aggregateRating": {
                    "@type": "AggregateRating",
                    "ratingValue": product["rating"],
                },
            },
            indent=2,
        )
        write_file(
            SITE_DIR / "products" / f"product-{safe_slug(product['sku'])}.html",
            layout(
                f"{product['name']} | Product Detail",
                f"""
<article class="product"
data-sku="{product['sku']}"
data-category="{product['category']}"
data-rating="{product['rating']}"
data-stock="{product['stock']}">
<h2 class="product-title">{product['name']}</h2>
<p class="sku">SKU: <strong>{product['sku']}</strong></p>
<p class="category">Category: <strong>{product['category']}</strong></p>
<p class="price" data-price="{product['price']}">${product['price']:.2f}</p>
<p class="rating">Rating: {product['rating']} / 5</p>
<p class="{'stock-low' if product['stock'] < 10 else 'stock-ok'}">Stock: {product['stock']}</p>
<h3>Features</h3>
<ul class="features">{feature_list}</ul>
<h3>Related modules</h3>
<ul>{related_links}</ul>
</article>
<script type="application/ld+json">{json_ld}</script>
""",
            ),
        )
# NOTE(review): this repeats the PRODUCTS catalogue defined earlier in the
# file. The opening `PRODUCTS = [` was missing from this copy, which left the
# entries and the closing bracket orphaned; it is restored here.
# Each entry carries: sku, name, category, price, rating, stock,
# features (list of short strings) and related (SKUs of other entries
# in this same list).
PRODUCTS = [
    {
        "sku": "CRW-101",
        "name": "Crawler Reliability Kit",
        "category": "automation",
        "price": 149.0,
        "rating": 4.8,
        "stock": 18,
        "features": ["retry policy", "queue replay", "structured logs"],
        "related": ["CRW-202", "CRW-303"],
    },
    {
        "sku": "CRW-202",
        "name": "Playwright Rendering Pack",
        "category": "browser",
        "price": 249.0,
        "rating": 4.7,
        "stock": 9,
        "features": ["headless chromium", "screenshots", "dynamic DOM extraction"],
        "related": ["CRW-101", "CRW-404"],
    },
    {
        "sku": "CRW-303",
        "name": "RAG Extraction Bundle",
        "category": "ai-data",
        "price": 199.0,
        "rating": 4.9,
        "stock": 13,
        "features": ["clean text chunks", "metadata capture", "JSONL export"],
        "related": ["CRW-101", "CRW-505"],
    },
    {
        "sku": "CRW-404",
        "name": "Anti-Fragile Session Toolkit",
        "category": "resilience",
        "price": 299.0,
        "rating": 4.6,
        "stock": 5,
        "features": ["session rotation", "state recovery", "graceful failures"],
        "related": ["CRW-202", "CRW-505"],
    },
    {
        "sku": "CRW-505",
        "name": "Data Export Control Plane",
        "category": "storage",
        "price": 179.0,
        "rating": 4.5,
        "stock": 21,
        "features": ["datasets", "key-value store", "CSV and JSON export"],
        "related": ["CRW-303", "CRW-404"],
    },
]
# NOTE(review): `layout` is also defined earlier in this file with the same
# template; being later, this definition is the one left bound after import.
def layout(title, body, extra_head="", extra_script=""):
    """Wrap *body* in the shared HTML page shell and return the full document.

    Args:
        title: Text used for the ``<title>``, the ``<h1>`` in the header and
            the start of the meta description. Inserted as-is (not escaped).
        body: HTML markup placed inside ``<main>``. Inserted as-is.
        extra_head: Optional markup appended at the end of ``<head>``.
        extra_script: Optional markup placed after the footer, just before
            ``</body>``.

    Returns:
        The complete HTML document as a string.
    """
    # NOTE(review): every nav link is an absolute URL on www.marktechpost.com
    # even though the footer describes a local demo site — confirm whether
    # site-relative links were intended.
    # The template text below is kept flush-left so the emitted markup matches
    # the source text line for line.
    css = """
<style>
body {
font-family: Inter, system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif;
margin: 0;
background: #f7f7fb;
color: #1f2430;
}
header {
background: #202638;
color: white;
padding: 28px 40px;
}
nav a {
color: #dbe7ff;
margin-right: 18px;
text-decoration: none;
font-weight: 600;
}
main {
max-width: 1050px;
margin: 0 auto;
padding: 32px;
}
.grid {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(230px, 1fr));
gap: 18px;
}
.card, article, .panel {
background: white;
border: 1px solid #e5e7ef;
border-radius: 16px;
padding: 20px;
box-shadow: 0 8px 25px rgba(20, 30, 60, 0.05);
}
.price {
font-size: 1.3rem;
font-weight: 800;
}
.tag {
display: inline-block;
background: #edf2ff;
border: 1px solid #d6e0ff;
border-radius: 999px;
padding: 4px 10px;
margin: 3px;
font-size: 0.82rem;
}
.stock-low {
color: #b42318;
font-weight: 700;
}
.stock-ok {
color: #067647;
font-weight: 700;
}
code, pre {
background: #111827;
color: #d1fae5;
border-radius: 10px;
}
pre {
padding: 16px;
overflow-x: auto;
}
footer {
padding: 30px 40px;
color: #606779;
}
</style>
"""
    return f"""
<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<meta name="description" content="{title} page for a Crawlee Python tutorial demo website.">
<title>{title}</title>
{css}
{extra_head}
</head>
<body>
<header>
<h1>{title}</h1>
<nav>
<a href="https://www.marktechpost.com/index.html">Home</a>
<a href="https://www.marktechpost.com/products/product-crw-101.html">Products</a>
<a href="https://www.marktechpost.com/docs/getting-started.html">Docs</a>
<a href="https://www.marktechpost.com/blog/crawling-at-scale.html">Blog</a>
<a href="https://www.marktechpost.com/dynamic.html">Dynamic JS Page</a>
<a href="https://www.marktechpost.com/admin/hidden.html">Admin</a>
</nav>
</header>
<main>{body}</main>
<footer>Local demo website generated for Crawlee Python advanced tutorial.</footer>
{extra_script}
</body>
</html>
"""
# NOTE(review): `build_demo_site` is also defined earlier in this file with
# the same steps; being later, this definition is the one left bound after
# import.
def build_demo_site():
    """Write robots.txt, the index page and one detail page per product.

    Output paths are built under ``SITE_DIR``:

    * ``robots.txt`` — disallows ``/admin/`` and allows everything else.
    * ``index.html`` — an intro panel, a teaser card for every entry in
      ``PRODUCTS`` and a list of links to other pages.
    * ``products/product-<slug>.html`` — one page per product with its
      attributes, feature list, links to related SKUs and a JSON-LD
      ``Product`` block.

    ``write_file``, ``SITE_DIR`` and ``safe_slug`` are defined elsewhere;
    presumably ``write_file(path, text)`` writes *text* to *path* and
    ``safe_slug`` turns a SKU into a file-name-safe slug — verify against
    their definitions.
    """
    # NOTE(review): all links are absolute URLs on www.marktechpost.com even
    # though the page copy describes a local website — confirm whether
    # site-relative links were intended.
    # Template text is kept flush-left so the emitted markup matches the
    # source text line for line.
    write_file(
        SITE_DIR / "robots.txt",
        """
User-agent: *
Disallow: /admin/
Allow: /
""",
    )

    # One teaser card per product; stock below 10 gets the "stock-low" class.
    product_cards = [
        f"""
<div class="card product-teaser" data-sku="{product['sku']}" data-category="{product['category']}">
<h2><a href="https://www.marktechpost.com/products/product-{safe_slug(product['sku'])}.html">{product['name']}</a></h2>
<p>{product['category']} crawler module with rating {product['rating']}.</p>
<p class="price" data-price="{product['price']}">${product['price']:.2f}</p>
<p class="{'stock-low' if product['stock'] < 10 else 'stock-ok'}">Stock: {product['stock']}</p>
</div>
"""
        for product in PRODUCTS
    ]

    write_file(
        SITE_DIR / "index.html",
        layout(
            "Crawlee Demo Commerce + Docs Hub",
            f"""
<section class="panel">
<h2>Why this site exists</h2>
<p>
This local website gives us predictable pages for testing Crawlee without scraping a third-party website.
We include static HTML pages, documentation pages, product detail pages, a blog article, robots.txt,
and a JavaScript-rendered page.
</p>
</section>
<h2>Featured crawler modules</h2>
<section class="grid">
{''.join(product_cards)}
</section>
<section class="panel">
<h2>Internal links for recursive crawling</h2>
<ul>
<li><a href="https://www.marktechpost.com/docs/getting-started.html">Getting started guide</a></li>
<li><a href="https://www.marktechpost.com/docs/advanced-routing.html">Advanced routing guide</a></li>
<li><a href="https://www.marktechpost.com/blog/crawling-at-scale.html">Crawling at scale article</a></li>
<li><a href="https://www.marktechpost.com/dynamic.html">JavaScript-rendered catalog</a></li>
<li><a href="https://www.marktechpost.com/admin/hidden.html">Admin page blocked by robots and crawler filters</a></li>
</ul>
</section>
""",
        ),
    )

    for product in PRODUCTS:
        related_links = "\n".join(
            f'<li><a class="related-link" href="https://www.marktechpost.com/products/product-{safe_slug(sku)}.html">{sku}</a></li>'
            for sku in product["related"]
        )
        feature_list = "\n".join(f"<li>{feature}</li>" for feature in product["features"])
        # Structured data embedded in the page as a JSON-LD <script> block.
        json_ld = json.dumps(
            {
                "@context": "https://schema.org",
                "@type": "Product",
                "sku": product["sku"],
                "name": product["name"],
                "category": product["category"],
                "offers": {
                    "@type": "Offer",
                    "price": product["price"],
                    "priceCurrency": "USD",
                },
                "aggregateRating": {
                    "@type": "AggregateRating",
                    "ratingValue": product["rating"],
                },
            },
            indent=2,
        )
        write_file(
            SITE_DIR / "products" / f"product-{safe_slug(product['sku'])}.html",
            layout(
                f"{product['name']} | Product Detail",
                f"""
<article class="product"
data-sku="{product['sku']}"
data-category="{product['category']}"
data-rating="{product['rating']}"
data-stock="{product['stock']}">
<h2 class="product-title">{product['name']}</h2>
<p class="sku">SKU: <strong>{product['sku']}</strong></p>
<p class="category">Category: <strong>{product['category']}</strong></p>
<p class="price" data-price="{product['price']}">${product['price']:.2f}</p>
<p class="rating">Rating: {product['rating']} / 5</p>
<p class="{'stock-low' if product['stock'] < 10 else 'stock-ok'}">Stock: {product['stock']}</p>
<h3>Features</h3>
<ul class="features">{feature_list}</ul>
<h3>Related modules</h3>
<ul>{related_links}</ul>
</article>
<script type="application/ld+json">{json_ld}</script>
""",
            ),
        )


