From 7d897c40b7bb74f57da6380f47e4a4af6a27b00c Mon Sep 17 00:00:00 2001 From: Mustafa Cagri Ardic Date: Sun, 24 May 2026 18:02:34 +0100 Subject: [PATCH] Add Polars documentation Add a scraper for the Polars Python API reference (https://docs.pola.rs/api/python/stable/reference/), pinned to the stable 1.41.0 release. The site uses the pydata-sphinx-theme, so the scraper reuses the shared sphinx/clean_html filter alongside Polars-specific filters: - clean_html removes the theme chrome (sidebars, in-page TOC, prev/next navigation, footer) and tags code blocks for Python highlighting. - entries names each page from its heading and groups entries into types (DataFrame, LazyFrame, Series, Expressions, Functions, Data Types, Input/output, etc.). Top-level members are stored flat under api/ and are classified by their member name. Latest-version detection uses the most recent GitHub release and strips the py- prefix, since the repo also tags Rust (rs-) releases. --- lib/docs/filters/polars/clean_html.rb | 32 +++++++++++++ lib/docs/filters/polars/entries.rb | 64 ++++++++++++++++++++++++++ lib/docs/scrapers/polars.rb | 34 ++++++++++++++ public/icons/docs/polars/16.png | Bin 0 -> 411 bytes public/icons/docs/polars/16@2x.png | Bin 0 -> 247 bytes public/icons/docs/polars/SOURCE | 1 + 6 files changed, 131 insertions(+) create mode 100644 lib/docs/filters/polars/clean_html.rb create mode 100644 lib/docs/filters/polars/entries.rb create mode 100644 lib/docs/scrapers/polars.rb create mode 100644 public/icons/docs/polars/16.png create mode 100644 public/icons/docs/polars/16@2x.png create mode 100644 public/icons/docs/polars/SOURCE diff --git a/lib/docs/filters/polars/clean_html.rb b/lib/docs/filters/polars/clean_html.rb new file mode 100644 index 0000000000..e270c8d985 --- /dev/null +++ b/lib/docs/filters/polars/clean_html.rb @@ -0,0 +1,32 @@ +module Docs + class Polars + class CleanHtmlFilter < Filter + def call + # Remove pydata-sphinx-theme chrome that survives the container extraction + # or sits inside the article (sidebars, in-page TOC, prev/next nav, footer). + css( + '.bd-sidebar-primary', + '.bd-sidebar-secondary', + '.bd-toc', + '.bd-header-article', + '.prev-next-area', + '.prev-next-footer', + '.bd-footer', + '.headerlink', + 'form' + ).remove + + # Drop banner/logo imagery on the landing page. + css('img').remove if root_page? + + # Make sure every code block is tagged so Prism highlights it as Python. + css('.highlight pre').each do |node| + node.content = node.content + node['data-language'] = 'python' + end + + doc + end + end + end +end diff --git a/lib/docs/filters/polars/entries.rb b/lib/docs/filters/polars/entries.rb new file mode 100644 index 0000000000..ab881f40ce --- /dev/null +++ b/lib/docs/filters/polars/entries.rb @@ -0,0 +1,64 @@ +module Docs + class Polars + class EntriesFilter < Docs::EntriesFilter + # Map the leading path segment of a reference page to a human readable + # type. The Polars reference is laid out as
/... under the base + # url (e.g. dataframe/api/polars.DataFrame.count.html). Top-level members + # (plain functions, datatypes, IO, config, ...) instead live flat under + # api/ and are classified by name in #classify_api. + SECTION_TYPES = { + 'dataframe' => 'DataFrame', + 'lazyframe' => 'LazyFrame', + 'series' => 'Series', + 'expressions' => 'Expressions', + 'functions' => 'Functions', + 'selectors' => 'Selectors', + 'datatypes' => 'Data Types', + 'datatype_expr' => 'Data Types', + 'config' => 'Config', + 'io' => 'Input/output', + 'sql' => 'SQL', + 'exceptions' => 'Exceptions', + 'testing' => 'Testing', + 'catalog' => 'Catalog', + 'metadata' => 'Metadata', + 'schema' => 'Schema', + 'plugins' => 'Plugins' + }.freeze + + def get_name + name = at_css('h1').content.strip + # This runs before clean_html removes the headerlink, so strip its + # anchor character off the heading. + name.sub! %r{\s*[#\u{00B6}]+\s*\z}, '' + name + end + + def get_type + return 'Manual' if root_page? + segment = slug.split('/').first + return classify_api(get_name) if segment == 'api' + SECTION_TYPES[segment] || 'Manual' + end + + private + + # Members stored flat under api/ (top-level polars.* objects). + def classify_api(name) + case name + when %r{\Apolars\.datatypes\.} then 'Data Types' + when %r{\Apolars\.Config\b} then 'Config' + when %r{\Apolars\.exceptions\.} then 'Exceptions' + when %r{\Apolars\.testing\.} then 'Testing' + when %r{\Apolars\.(api|plugins)\.} then 'Plugins' + when %r{\Apolars\.io\.} then 'Input/output' + when %r{\Apolars\.DataFrame\.} then 'DataFrame' + when %r{\Apolars\.LazyFrame\.} then 'LazyFrame' + when %r{\Apolars\.(read_|scan_|write_|from_)}, %r{\Apolars\.json_normalize\b} + 'Input/output' + else 'Functions' + end + end + end + end +end diff --git a/lib/docs/scrapers/polars.rb b/lib/docs/scrapers/polars.rb new file mode 100644 index 0000000000..fa332be7ca --- /dev/null +++ b/lib/docs/scrapers/polars.rb @@ -0,0 +1,34 @@ +module Docs + class Polars < UrlScraper + self.name = 'Polars' + self.type = 'sphinx' + self.release = '1.41.0' + self.base_url = 'https://docs.pola.rs/api/python/stable/reference/' + self.root_path = 'index.html' + self.links = { + home: 'https://pola.rs/', + code: 'https://github.com/pola-rs/polars' + } + + html_filters.push 'polars/entries', 'sphinx/clean_html', 'polars/clean_html' + + # pydata-sphinx-theme keeps the page content in the article body. + options[:container] = 'article.bd-article' + + options[:skip_patterns] = [/_changelog/, /whatsnew/] + + # https://github.com/pola-rs/polars/blob/main/LICENSE + options[:attribution] = <<-HTML + © 2020 Ritchie Vink
+ © 2022 Polars contributors
+ Licensed under the MIT License. + HTML + + # Polars tags both Rust (rs-*) and Python (py-*) releases in the same repo. + # The tags API only lists recent Rust ones, but the latest GitHub release is + # always the Python one, so use that and drop the py- prefix. + def get_latest_version(opts) + get_latest_github_release('pola-rs', 'polars', opts).sub(/\Apy-/, '') + end + end +end diff --git a/public/icons/docs/polars/16.png b/public/icons/docs/polars/16.png new file mode 100644 index 0000000000000000000000000000000000000000..c005950aa82b39cf7ac613c5a488966c0536edde GIT binary patch literal 411 zcmV;M0c8G(P)J**&`VRr z_aU&7y0#1O3@?&+f(sbMQQXEdp5ohH43_XPO~(?qVFo`l{VrDU2m@?lvtR}%FpiNV zV1K3-OyE7%aTb@bhSDUcX*>`HOVME@O>=mR`Diqa%cvEn@Fj^JUS{e!d<~N`0k2rZ zHGIT_l8xCEUJdOQpQ1w>fGb(g;dO>C6uiZq0GtfKT9m(tg8^EOHb?ya-2+S&EZ})4 z9*)v*93P_1ShQ$x8t?EKZ)(olPr*H$&sqDpnZhPOqqvS+SVh0!Rao?*{&?u#=uq9u zYJ)?Wd>@lZY~hE$kFD#qvmVq?pA5b2i=*=$evvyE{Q^1GUku6qxtRa}002ovPDHLk FV1k+lw4K