Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
97 changes: 97 additions & 0 deletions .github/workflows/test-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,103 @@ jobs:
APIFY_SIGNING_TOKEN: ${{ secrets.APIFY_SIGNING_TOKEN }}
SEGMENT_TOKEN: ${{ secrets.SEGMENT_TOKEN }}

# Install Nginx so the reverse-proxy config in website/nginx.conf can be
# exercised against the docs site in the assertion steps below.
- name: Install Nginx
  run: |
    sudo apt-get update
    sudo apt-get install -y nginx

# Serve the built docs locally so Nginx has a backend to proxy to.
- name: Start Docusaurus server
  run: |
    cd website
    nohup yarn docusaurus serve --port 3000 --no-open &
    # Poll until the server answers instead of a fixed `sleep 5`: a cold CI
    # runner can take longer than 5s to bind the port, so a fixed delay
    # either flakes or wastes time. --retry-connrefused makes curl retry
    # while the port is not yet listening; -f still fails the step if the
    # server never comes up within ~30s.
    curl -sf --retry 30 --retry-delay 1 --retry-connrefused http://localhost:3000 > /dev/null

# Wrap the project's nginx.conf (http-level directives) in a minimal
# top-level config so nginx can run unprivileged out of the workspace.
- name: Start Nginx with project config
  run: |
    PWD_PATH="$(pwd)"
    # Logs and pid live inside the workspace so no root-owned paths are
    # touched; the project config is pulled in via `include`.
    cat > default.conf <<EOF
    worker_processes auto;
    error_log ${PWD_PATH}/logs/error.log;
    pid ${PWD_PATH}/logs/nginx.pid;
    events {}
    http {
    access_log ${PWD_PATH}/logs/access.log;
    include ${PWD_PATH}/website/nginx.conf;
    }
    EOF
    # NOTE(review): this sed rewrites default.conf, but the wrapper written
    # above never contains the apify.github.io URL -- the backend URL lives
    # in website/nginx.conf, which is only *included by path*. As written
    # this looks like a no-op, so the proxy keeps pointing at GitHub Pages
    # rather than the local Docusaurus server; confirm whether the included
    # file should be copied and rewritten instead.
    sed -i 's|https://apify.github.io/crawlee|http://localhost:3000|g' default.conf
    mkdir -p "${PWD_PATH}/logs"
    nginx -c "${PWD_PATH}/default.conf"
    sleep 1

# Black-box assertions against the running Nginx proxy: content negotiation
# (HTML vs markdown), llms.txt endpoints, and open-redirect protection.
- name: Run header assertions
  run: |
    set -euo pipefail

    # assert_header URL HEADER EXPECTED [curl args...]
    # Fails unless the named response header contains EXPECTED.
    assert_header() {
      url=$1
      header=$2
      expected=$3
      shift 3
      extra_args=("$@")
      actual=$(curl -s -D - -o /dev/null "${extra_args[@]}" "$url" | grep -i "^$header" | tr -d '\r' || true)
      echo "→ $url → $actual"
      # Brace group, not a subshell: `exit` inside `( ... )` only ends the
      # subshell, so aborting the step would depend entirely on set -e
      # propagating the subshell's status. The brace group exits directly.
      echo "$actual" | grep -q "$expected" || { echo "❌ Expected '$expected' in '$header' for $url"; exit 1; }
    }

    # assert_status URL EXPECTED [curl args...]
    # Fails unless the HTTP status code equals EXPECTED exactly.
    assert_status() {
      url=$1
      expected=$2
      shift 2
      extra_args=("$@")
      actual=$(curl -s -o /dev/null -w "%{http_code}" "${extra_args[@]}" "$url")
      echo "→ $url → HTTP $actual"
      [ "$actual" = "$expected" ] || { echo "❌ Expected HTTP $expected but got $actual for $url"; exit 1; }
    }

    # assert_no_redirect URL [curl args...]
    # Fails if the response is a 301/302 (open-redirect guard).
    assert_no_redirect() {
      url=$1
      shift
      extra_args=("$@")
      response=$(curl -s -D - -o /dev/null -w "\n%{http_code}" "${extra_args[@]}" "$url" 2>/dev/null)
      status=$(echo "$response" | tail -1)
      location=$(echo "$response" | grep -i "^location:" | tr -d '\r' || true)
      echo "→ $url → HTTP $status ${location:+(${location})}"
      if [ "$status" = "301" ] || [ "$status" = "302" ]; then
        echo "❌ Got redirect for $url: $location"
        exit 1
      fi
    }

    echo "🧪 Checking open redirect protection..."
    assert_no_redirect "http://localhost:8080///%5Cevil.com/"
    assert_no_redirect "http://localhost:8080/%5Cevil.com/"
    assert_no_redirect "http://localhost:8080///%5cevil.com/"
    assert_no_redirect "http://localhost:8080" --request-target '/\evil.com/'
    assert_no_redirect "http://localhost:8080" --request-target '///\evil.com/'
    # A legitimate trailing-slash redirect must still work.
    assert_status "http://localhost:8080/js/docs/quick-start/" "302"

    echo "🧪 Checking Nginx responses... (crawlee JS)"
    assert_header "http://localhost:8080/" "Content-Type" "text/html"
    assert_header "http://localhost:8080/" "Content-Type" "text/markdown" -H "Accept: text/markdown"
    assert_header "http://localhost:8080/js/docs/quick-start" "Content-Type" "text/html"
    assert_header "http://localhost:8080/js/docs/quick-start.md" "Content-Type" "text/markdown"
    assert_header "http://localhost:8080/js/docs/quick-start" "Content-Type" "text/markdown" -H "Accept: text/markdown"
    assert_header "http://localhost:8080/llms.txt" "Content-Type" "text/markdown"
    assert_header "http://localhost:8080/llms-full.txt" "Content-Type" "text/markdown"

    echo "🧪 Checking Nginx responses... (crawlee Python)"
    assert_header "http://localhost:8080/python/docs/quick-start" "Content-Type" "text/html"
    assert_header "http://localhost:8080/python/docs/quick-start.md" "Content-Type" "text/markdown"
    assert_header "http://localhost:8080/python/docs/quick-start" "Content-Type" "text/markdown" -H "Accept: text/markdown"
    assert_header "http://localhost:8080/python/llms.txt" "Content-Type" "text/markdown"
    assert_header "http://localhost:8080/python/llms-full.txt" "Content-Type" "text/markdown"

    echo "✅ All Nginx header checks passed."

# Always attempt shutdown so a failed assertion step does not leave a
# running nginx (and its workspace pid file) behind.
- name: Stop Nginx
  if: always()
  run: nginx -c "$(pwd)/default.conf" -s stop

lint:
name: Lint
runs-on: ubuntu-22.04
Expand Down
77 changes: 75 additions & 2 deletions website/nginx.conf
Original file line number Diff line number Diff line change
@@ -1,23 +1,96 @@
# $serve_markdown is 1 when the client's Accept header contains
# text/plain or text/markdown (case-insensitive regex match), 0 otherwise.
map $http_accept $serve_markdown {
    default 0;
    ~*text/plain 1;
    ~*text/markdown 1;
}

# $has_no_extension is 1 when the request URI contains no dot at all.
# NOTE(review): $request_uri includes the query string, so a request like
# /docs/page?v=1.2 counts as "has extension" and skips the .md rewrite --
# confirm that is intended.
map $request_uri $has_no_extension {
    ~^[^.]*$ 1;
    default 0;
}

# Nginx reverse proxy configuration for crawlee.dev
# Routes to GitHub Pages and handles legacy URL redirects
server {
listen 0.0.0.0:8080;
server_name 'crawlee.dev';

# comment out the resolver and use localhost:3000 for local development
set $backend "https://apify.github.io/crawlee";
resolver 1.1.1.1 8.8.8.8 valid=30s ipv6=off;

# Health check endpoint
location /health {
access_log off;
return 200 '{"status":"UP"}';
add_header Content-Type application/json;
}

    # Exact match for the site root: markdown-capable clients are served the
    # LLM-oriented /llms.txt instead of the HTML landing page.
    location = / {
        if ($serve_markdown) {
            rewrite ^ /llms.txt last;
        }
        proxy_pass $backend/;
    }

    # /llms.txt and /llms-full.txt are markdown: hide the upstream's
    # Content-Type and force text/markdown. `always` keeps the header on
    # non-2xx/3xx responses as well.
    location ~ ^/(llms|llms-full)\.txt$ {
        proxy_hide_header Content-Type;
        add_header Content-Type 'text/markdown; charset=utf-8' always;
        proxy_pass $backend$uri;
    }

    # remove trailing slashes from all URLs (except root /)
    # exact match locations (e.g., location = /python/) take priority over this regex
    # Only match URIs composed of safe characters (letters, digits, dots, hyphens,
    # underscores, forward slashes). This prevents open redirect via %5C (backslash):
    # nginx decodes %5C to \ in $uri, and \ in the Location header gets normalized
    # to / by browsers, turning /\evil.com into //evil.com (protocol-relative URL).
    location ~ ^(/[a-zA-Z0-9][a-zA-Z0-9_./-]*)/$ {
        # The trailing `?` prevents nginx from re-appending the original query
        # string after the explicit $is_args$args.
        rewrite ^(.+)/$ $1$is_args$args? redirect;
    }

location / {
proxy_pass https://apify.github.io/crawlee/;
set $rewrite_condition "$serve_markdown$has_no_extension";
set $proxy_path $request_uri;

if ($rewrite_condition = "11") {
set $proxy_path "${request_uri}.md";
}
proxy_pass $backend$proxy_path;
}

    ### Repository path: "/python"

    # Exact match /python (no trailing slash): markdown-capable clients get
    # the Python docs' llms.txt; everyone else gets the proxied landing page.
    location = /python {
        if ($serve_markdown) {
            rewrite ^ /python/llms.txt last;
        }
        proxy_pass https://apify.github.io/crawlee-python/;
    }
    # NOTE(review): this `location /python {` has no body or closing brace in
    # view and is immediately followed by `location = /python/` -- it looks
    # like a stray/duplicate line (possibly a merge or diff artifact); confirm
    # and remove, otherwise nginx will fail to parse the file.
    location /python {

    # Exact match /python/ (with trailing slash): same markdown negotiation
    # as the slash-less variant above.
    location = /python/ {
        if ($serve_markdown) {
            rewrite ^ /python/llms.txt last;
        }
        proxy_pass https://apify.github.io/crawlee-python/;
    }

    # Python-docs llms files: force text/markdown, mirroring the JS llms
    # location above; $1 is the captured basename (llms or llms-full).
    location ~ ^/python/(llms|llms-full)\.txt$ {
        proxy_hide_header Content-Type;
        add_header Content-Type 'text/markdown; charset=utf-8' always;
        proxy_pass https://apify.github.io/crawlee-python/$1.txt;
    }

    # Any other /python/... path: strip the /python prefix and proxy to the
    # crawlee-python GitHub Pages site. Extension-less URIs requested by
    # markdown-capable clients get ".md" appended, the same negotiation
    # scheme as the JS catch-all.
    location ~ ^/python/(.*)$ {
        set $path_suffix $1;
        set $proxy_path "/$path_suffix";
        set $rewrite_condition "$serve_markdown$has_no_extension";
        if ($rewrite_condition = "11") {
            set $proxy_path "${proxy_path}.md";
        }
        proxy_pass https://apify.github.io/crawlee-python$proxy_path;
    }

# So that we can have both GH pages and crawlee.dev/python working and loading assets from the same path
location /crawlee-python {
proxy_pass https://apify.github.io/crawlee-python/;
Expand Down
Loading