From 9586a6aa231c679b00bb527087e9ccfa7bf0f49c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Ad=C3=A1mek?= Date: Mon, 30 Mar 2026 16:22:58 +0200 Subject: [PATCH] docs: serve markdown via Accept header in nginx (#3542) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary - Nginx now serves markdown when `Accept: text/markdown` or `text/plain` is in the request header (appends `.md` to proxied path), for both JS and Python docs - Root paths (`/`, `/python`) redirect to `llms.txt` when markdown is requested - Adds open redirect protection for trailing slash redirects - CI assertions verify Content-Type headers for both doc sources Ported from apify/apify-docs nginx config. 🤖 Generated with [Claude Code](https://claude.com/claude-code) --------- Co-authored-by: Claude Opus 4.6 (1M context) --- .github/workflows/test-ci.yml | 97 +++++++++++++++++++++++++++++++++++ website/nginx.conf | 77 ++++++++++++++++++++++++++- 2 files changed, 172 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test-ci.yml b/.github/workflows/test-ci.yml index 3646de004ad4..3779cea29622 100644 --- a/.github/workflows/test-ci.yml +++ b/.github/workflows/test-ci.yml @@ -133,6 +133,103 @@ jobs: APIFY_SIGNING_TOKEN: ${{ secrets.APIFY_SIGNING_TOKEN }} SEGMENT_TOKEN: ${{ secrets.SEGMENT_TOKEN }} + - name: Install Nginx + run: | + sudo apt-get update + sudo apt-get install -y nginx + + - name: Start Docusaurus server + run: | + cd website + nohup yarn docusaurus serve --port 3000 --no-open & + sleep 5 + curl -f http://localhost:3000 > /dev/null + + - name: Start Nginx with project config + run: | + PWD_PATH="$(pwd)" + cat > default.conf </dev/null) + status=$(echo "$response" | tail -1) + location=$(echo "$response" | grep -i "^location:" | tr -d '\r' || true) + echo "→ $url → HTTP $status ${location:+(${location})}" + if [ "$status" = "301" ] || [ "$status" = "302" ]; then + echo "❌ Got redirect for $url: $location" && exit 1 + fi + 
} + + echo "🧪 Checking open redirect protection..." + assert_no_redirect "http://localhost:8080///%5Cevil.com/" + assert_no_redirect "http://localhost:8080/%5Cevil.com/" + assert_no_redirect "http://localhost:8080///%5cevil.com/" + assert_no_redirect "http://localhost:8080" --request-target '/\evil.com/' + assert_no_redirect "http://localhost:8080" --request-target '///\evil.com/' + assert_status "http://localhost:8080/js/docs/quick-start/" "302" + + echo "🧪 Checking Nginx responses... (crawlee JS)" + assert_header "http://localhost:8080/" "Content-Type" "text/html" + assert_header "http://localhost:8080/" "Content-Type" "text/markdown" -H "Accept: text/markdown" + assert_header "http://localhost:8080/js/docs/quick-start" "Content-Type" "text/html" + assert_header "http://localhost:8080/js/docs/quick-start.md" "Content-Type" "text/markdown" + assert_header "http://localhost:8080/js/docs/quick-start" "Content-Type" "text/markdown" -H "Accept: text/markdown" + assert_header "http://localhost:8080/llms.txt" "Content-Type" "text/markdown" + assert_header "http://localhost:8080/llms-full.txt" "Content-Type" "text/markdown" + + echo "🧪 Checking Nginx responses... (crawlee Python)" + assert_header "http://localhost:8080/python/docs/quick-start" "Content-Type" "text/html" + assert_header "http://localhost:8080/python/docs/quick-start.md" "Content-Type" "text/markdown" + assert_header "http://localhost:8080/python/docs/quick-start" "Content-Type" "text/markdown" -H "Accept: text/markdown" + assert_header "http://localhost:8080/python/llms.txt" "Content-Type" "text/markdown" + assert_header "http://localhost:8080/python/llms-full.txt" "Content-Type" "text/markdown" + + echo "✅ All Nginx header checks passed." 
+ + - name: Stop Nginx + if: always() + run: nginx -c "$(pwd)/default.conf" -s stop + lint: name: Lint runs-on: ubuntu-22.04 diff --git a/website/nginx.conf b/website/nginx.conf index 1739070e5012..9081661acf1b 100644 --- a/website/nginx.conf +++ b/website/nginx.conf @@ -1,9 +1,24 @@ +map $http_accept $serve_markdown { + default 0; + ~*text/plain 1; + ~*text/markdown 1; +} + +map $request_uri $has_no_extension { + ~^[^.]*$ 1; + default 0; +} + # Nginx reverse proxy configuration for crawlee.dev # Routes to GitHub Pages and handles legacy URL redirects server { listen 0.0.0.0:8080; server_name 'crawlee.dev'; + # comment out the resolver and use localhost:3000 for local development + set $backend "https://apify.github.io/crawlee"; + resolver 1.1.1.1 8.8.8.8 valid=30s ipv6=off; + # Health check endpoint location /health { access_log off; @@ -11,13 +26,71 @@ server { add_header Content-Type application/json; } + location = / { + if ($serve_markdown) { + rewrite ^ /llms.txt last; + } + proxy_pass $backend/; + } + + location ~ ^/(llms|llms-full)\.txt$ { + proxy_hide_header Content-Type; + add_header Content-Type 'text/markdown; charset=utf-8' always; + proxy_pass $backend$uri; + } + + # remove trailing slashes from all URLs (except root /) + # exact match locations (e.g., location = /python/) take priority over this regex + # Only match URIs composed of safe characters (letters, digits, dots, hyphens, + # underscores, forward slashes). This prevents open redirect via %5C (backslash): + # nginx decodes %5C to \ in $uri, and \ in the Location header gets normalized + # to / by browsers, turning /\evil.com into //evil.com (protocol-relative URL). + location ~ ^(/[a-zA-Z0-9][a-zA-Z0-9_./-]*)/$ { + rewrite ^(.+)/$ $1$is_args$args? 
redirect; + } + location / { - proxy_pass https://apify.github.io/crawlee/; + set $rewrite_condition "$serve_markdown$has_no_extension"; + set $proxy_path $request_uri; + + if ($rewrite_condition = "11") { + set $proxy_path "${request_uri}.md"; + } + proxy_pass $backend$proxy_path; + } + + ### Repository path: "/python" + + location = /python { + if ($serve_markdown) { + rewrite ^ /python/llms.txt last; + } + proxy_pass https://apify.github.io/crawlee-python/; } - location /python { + + location = /python/ { + if ($serve_markdown) { + rewrite ^ /python/llms.txt last; + } proxy_pass https://apify.github.io/crawlee-python/; } + location ~ ^/python/(llms|llms-full)\.txt$ { + proxy_hide_header Content-Type; + add_header Content-Type 'text/markdown; charset=utf-8' always; + proxy_pass https://apify.github.io/crawlee-python/$1.txt; + } + + location ~ ^/python/(.*)$ { + set $path_suffix $1; + set $proxy_path "/$path_suffix"; + set $rewrite_condition "$serve_markdown$has_no_extension"; + if ($rewrite_condition = "11") { + set $proxy_path "${proxy_path}.md"; + } + proxy_pass https://apify.github.io/crawlee-python$proxy_path; + } + # So that we can have both GH pages and crawlee.dev/python working and loading assets from the same path location /crawlee-python { proxy_pass https://apify.github.io/crawlee-python/;