diff --git a/docs/cloud/features/02_integrations/02_data_catalogs.md b/docs/cloud/features/02_integrations/02_data_catalogs.md index 65b868f4a12..f65309be252 100644 --- a/docs/cloud/features/02_integrations/02_data_catalogs.md +++ b/docs/cloud/features/02_integrations/02_data_catalogs.md @@ -18,20 +18,25 @@ Through the integration, your catalog's tables will appear as queryable database Setup is available both via SQL command ([DataLakeCatalog](/engines/database-engines/datalakecatalog)) and via the ClickHouse Cloud UI on the Data Sources tab. Using the UI: + - Simplifies setup with a form using fields consistent with your Data Catalog objects - Provides a single interface for active data catalog integrations - Tests connections and credentials when saving ClickHouse Cloud UI with data catalog integrations -| Name | Open Table Format Supported | Auth Method | Support | Version | -|------|-----------------------------|----------------------------------------|---------|---------| -| AWS Glue Catalog | Iceberg | IAM/Access keys | Cloud & [Core](/use-cases/data-lake/glue-catalog) | 25.10+ | -| Lakekeeper | Iceberg | OAuth client credentials | [Core](/use-cases/data-lake/lakekeeper-catalog) | 25.10+ | -| Microsoft OneLake | Iceberg | Azure Active Directory (AAD) | Cloud & [Core](/use-cases/data-lake/onelake-catalog) | 25.12+ | -| Nessie | Iceberg | OAuth client credentials | [Core](/use-cases/data-lake/nessie-catalog) | 25.10+ | -| Polaris/Open Catalog | Iceberg | OAuth client credentials | [Core](/use-cases/data-lake/polaris-catalog) | 26.1+ | -| REST catalog | Iceberg | OAuth client credentials, Bearer token | Cloud & [Core](/use-cases/data-lake/rest-catalog) | 25.10+ | -| Unity Catalog | Iceberg (UniForm-enabled and managed), Delta | OAuth client credentials | Cloud (Iceberg only) & [Core](/use-cases/data-lake/unity-catalog) | 25.10+ | - -We have more catalogs planned, including Horizon and S3 tables REST endpoint. 
+For a step-by-step Cloud UI walkthrough, see [Connect a data catalog in ClickHouse Cloud](/integrations/data-catalogs). + +| Name | Open table format | Auth method | Cloud | Core | Version | +| ----------------- | ----------------- | -------------------------------------- | ---------------------------------------------------------- | ------------------------------------------------ | ------- | +| AWS Glue Catalog | Iceberg | IAM role (26.2+), Access keys | [Guide](/integrations/data-catalogs?catalog=aws-glue#add-your-catalog-connection) | [Guide](/use-cases/data-lake/glue-catalog) | 25.10+ | +| BigLake Metastore | Iceberg | Google ADC (OAuth) | [Guide](/integrations/data-catalogs?catalog=biglake#add-your-catalog-connection) | [Guide](/use-cases/data-lake/biglake-catalog) | 26.2+ | +| Lakekeeper | Iceberg | OAuth client credentials | — | [Guide](/use-cases/data-lake/lakekeeper-catalog) | 25.10+ | +| Microsoft OneLake | Iceberg | Azure Active Directory (AAD) | [Guide](/integrations/data-catalogs?catalog=onelake#add-your-catalog-connection) | [Guide](/use-cases/data-lake/onelake-catalog) | 25.12+ | +| Nessie | Iceberg | OAuth client credentials | — | [Guide](/use-cases/data-lake/nessie-catalog) | 25.10+ | +| Polaris | Iceberg | OAuth client credentials | [Guide](/integrations/data-catalogs?catalog=polaris#add-your-catalog-connection) | [Guide](/use-cases/data-lake/polaris-catalog) | 26.1+ | +| REST catalog | Iceberg | OAuth client credentials, Bearer token | [Guide](/integrations/data-catalogs?catalog=rest#add-your-catalog-connection) | [Guide](/use-cases/data-lake/rest-catalog) | 25.10+ | +| Unity Catalog | Iceberg | OAuth client credentials | [Guide](/integrations/data-catalogs?catalog=unity-iceberg#add-your-catalog-connection) | [Guide](/use-cases/data-lake/unity-catalog) | 25.10+ | +| Unity Catalog | Delta | Personal Access Token (PAT) | [Guide](/integrations/data-catalogs?catalog=unity-delta#add-your-catalog-connection) | 
[Guide](/use-cases/data-lake/unity-catalog) | 25.10+ | + +We have more catalogs planned, including Horizon and S3 tables REST endpoint. \ No newline at end of file diff --git a/docs/integrations/data-catalogs/index.md b/docs/integrations/data-catalogs/index.md new file mode 100644 index 00000000000..a08afb8df01 --- /dev/null +++ b/docs/integrations/data-catalogs/index.md @@ -0,0 +1,257 @@ +--- +sidebar_label: 'Connect a data catalog' +description: 'Connect an external data catalog to ClickHouse Cloud via the Data sources UI.' +slug: /integrations/data-catalogs +title: 'Connect a data catalog in ClickHouse Cloud' +doc_type: 'guide' +keywords: ['data catalogs', 'data lake', 'iceberg', 'unity catalog', 'glue', 'delta lake', 'onelake', 'polaris', 'biglake', 'clickhouse cloud'] +integration: + - support_level: 'core' + - category: 'data_lake' +--- + +import Image from '@theme/IdealImage'; +import BetaBadge from '@theme/badges/BetaBadge'; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +import catalog_flyout_select from '@site/static/images/integrations/data-catalogs/catalog-flyout-select.png'; +import linked_catalogs_table from '@site/static/images/integrations/data-catalogs/linked-catalogs-table.png'; +import catalog_tables_browser from '@site/static/images/integrations/data-catalogs/catalog-tables-browser.png'; +import catalog_sql_query from '@site/static/images/integrations/data-catalogs/catalog-sql-query.png'; +import data_catalogs_ui from '@site/static/images/cloud/features/data-catalogs-ui.png'; + + + +# Connect a data catalog in ClickHouse Cloud + +Connect ClickHouse Cloud to your data catalogs to access your open table format tables. You can set up connections in the **Data sources** UI. For setup via SQL, use the [`DataLakeCatalog`](/engines/database-engines/datalakecatalog) database engine in your SQL editor of choice. 
+ +ClickHouse Cloud UI with data catalog integrations + +Once connected, catalog tables show up in the SQL console under the database name you choose. You can query them with standard ClickHouse SQL, join them with [MergeTree](/engines/table-engines/mergetree-family/mergetree) tables, and use them as sources for [materialized views](/materialized-views). + +## Prerequisites {#prerequisites} + +Before you connect a catalog, confirm the following: + +- **Service permissions.** You need the `control-plane:service:manage` permission to access the **Data sources** page and add catalogs. +- **Running service.** If the service is idle, wake it from the **Data sources** page or the service overview before connecting or viewing linked catalogs. +- **Catalog credentials.** Gather connection details for your catalog type before opening the flyout. Each catalog uses different fields and authentication — see [Add your catalog connection](#add-your-catalog-connection) below. + +## Connect your catalog {#connect-your-catalog} + +Make sure you're logged in to your [ClickHouse Cloud](https://cloud.clickhouse.com/) account. + + + +### Open the data catalog flyout {#open-flyout} + +1. In the console, open the ClickHouse Cloud service you want to connect. +2. Select **Data sources** in the left navigation. +3. Click **+ Add catalog** if you haven't set up any data sources. Otherwise, click **Add data source** > **Add data lake catalog**. +4. In the **Connect your data catalog** flyout, select your catalog from the **Select catalog** dropdown. If the catalog supports multiple open table formats, choose the format in **Open table format**. 
+ +Connect your data catalog flyout with Select catalog dropdown showing AWS Glue, BigLake Metastore, Microsoft OneLake, Polaris, REST Catalog, and Unity Catalog + +### Add your catalog connection {#add-your-catalog-connection} + +Select your catalog below for field-level guidance and prerequisites, then fill in the connection parameters and a **Database name**. The **Database name** is the ClickHouse database that exposes your catalog tables in the SQL console. + + + + +[AWS Glue Catalog](/use-cases/data-lake/glue-catalog) exposes [Iceberg](/engines/table-engines/integrations/iceberg) tables registered in the Glue Data Catalog. + +Before you connect, confirm: + +- ClickHouse version 25.12+. +- Iceberg tables are registered in AWS Glue Data Catalog in your target region. +- For access key authentication, you have an IAM user access key with permissions to read Glue metadata and the underlying S3 objects. +- For IAM role authentication (26.2+), you have an IAM role that trusts your ClickHouse service role. Include the service role ARN from **Settings → Network security information** in the role trust policy. See [Accessing Iceberg data securely](/cloud/data-sources/secure-iceberg) for IAM policy examples. + +In the flyout, enter your AWS **Region** (e.g. `us-west-2`), then choose an authentication method: + +**Access key authentication** + +1. Select **AWS Access Key** as the **Authentication method**. +2. Enter your **Access Key ID** and **Secret Access Key**. +3. Enter a **Database name** for the ClickHouse database that exposes your Glue tables. + +**IAM role authentication (26.2+)** + +1. Select **AWS IAM Role** as the **Authentication method**. +2. Copy the **Service role ID (IAM)** from the flyout panel and add it to your IAM role trust policy. +3. Enter your **AWS Role ARN** and an optional **AWS Role Session Name**. +4. Enter a **Database name** for the ClickHouse database that exposes your Glue tables. 
+ +:::note +Glue supports multiple table formats, but ClickHouse only reads **Iceberg** tables from Glue. +::: + +
+
+ + +Query Unity Catalog managed [Iceberg](/engines/table-engines/integrations/iceberg) tables using OAuth client credentials from a Databricks service principal. See the [Unity Catalog guide](/use-cases/data-lake/unity-catalog#read-iceberg) for full setup. + +Before you connect, confirm: + +- ClickHouse version 25.12+. +- [Unity Catalog is configured for external data access](https://docs.databricks.com/aws/en/external-access/admin). +- Databricks [service principal](https://docs.databricks.com/aws/en/dev-tools/auth/oauth-m2m) with OAuth client ID and secret. The service principal has `USE CATALOG`, `USE SCHEMA`, `EXTERNAL USE SCHEMA` and `SELECT` privileges on the tables you want to query. + +In the flyout: + +1. Enter your Databricks **Workspace URL** (e.g. `dbc-1234567a-cbde`). +2. Enter the **Databricks catalog name** to connect (e.g. `icebench`). +3. Enter the OAuth **Client ID** and **Client secret** for your service principal. +4. Enter a **Database name** for the ClickHouse database that exposes your Unity Catalog tables. +
+
+ + +Query Unity Catalog [Delta Lake](/engines/table-engines/integrations/deltalake) tables using a Databricks Personal Access Token (PAT). See the [Unity Catalog guide](/use-cases/data-lake/unity-catalog#read-delta) for full setup. + +Before you connect, confirm: + +- ClickHouse version 25.12+. +- [Unity Catalog is configured for external data access](https://docs.databricks.com/aws/en/external-access/admin). +- Databricks [Personal Access Token](https://docs.databricks.com/aws/en/dev-tools/auth/pat) with at least `EXTERNAL USE SCHEMA`, `USE CATALOG`, `USE SCHEMA`, and `SELECT` on the target tables. + +In the flyout: + +1. Enter your Databricks **Workspace URL** (e.g. `dbc-1234567a-cbde.azuredatabricks.net`). +2. Enter the **Databricks catalog name** to connect. +3. Enter your **Personal Access Token**. +4. Enter a **Database name** for the ClickHouse database that exposes your Delta tables. + +:::note +Iceberg and Delta tables use different authentication methods in the UI. To access both types of tables, add two separate catalog connections, each with its own ClickHouse database. +::: +
+
+ + +Connect to any catalog that implements the [Iceberg REST Catalog](https://github.com/apache/iceberg/blob/main/open-api/rest-catalog-open-api.yaml) specification. See the [REST catalog guide](/use-cases/data-lake/rest-catalog) for full setup. + +Before you connect, confirm: + +- ClickHouse version 25.12+. +- Your REST catalog endpoint is reachable from ClickHouse Cloud. +- You have OAuth client credentials or a bearer token, depending on your catalog configuration. +- You have an S3 or S3-compatible **Storage Endpoint** URI for table data (e.g. `s3://my-bucket/path`). + +In the flyout: + +1. Enter the **Catalog URL** (e.g. `https://catalog.example.com/v1`). +2. Enter the **Warehouse** or catalog namespace (e.g. `demo`). +3. Enter the **Storage Endpoint** URI prefix for table storage. +4. Select an **Authentication method**: **OAuth Client Credentials** or **Bearer Token**, then enter the matching credentials. +5. Enter a **Database name** for the ClickHouse database that exposes your REST catalog tables. +
+
+ + +Query [Iceberg](/engines/table-engines/integrations/iceberg) tables in Microsoft Fabric OneLake using Azure AD application credentials. See the [Fabric OneLake guide](/use-cases/data-lake/onelake-catalog) for full setup. + +Before you connect, confirm: + +- ClickHouse version 25.12+. +- Iceberg tables exist in a Fabric workspace. +- You have an Entra ID (Azure AD) application with client ID and secret. +- You have your tenant ID, workspace ID, and a data item ID. Use your **Lakehouse ID** from the Lakehouse page URL. See [Microsoft OneLake prerequisites](https://learn.microsoft.com/en-us/fabric/onelake/table-apis/table-apis-overview#prerequisites) for help locating these values. + +In the flyout: + +1. Enter your Fabric **Workspace ID**. +2. Enter the **Data Item ID** — use your Lakehouse GUID. Warehouse IDs are not supported. +3. Enter your Entra ID **Tenant ID**, **Application (client) ID**, and **Client secret**. +4. Enter a **Database name** for the ClickHouse database that exposes your OneLake tables. +
+
+ + +Connect to a Snowflake Open Catalog (Polaris) deployment for [Iceberg](/engines/table-engines/integrations/iceberg) tables. See the [Polaris catalog guide](/use-cases/data-lake/polaris-catalog) for full setup. + +Before you connect, confirm: + +- ClickHouse version 26.2+. +- You have a Polaris catalog with OAuth client credentials. +- You have a storage endpoint URI for Iceberg table data (e.g. `s3://company-iceberg-prod/warehouse/`). + +In the flyout: + +1. Enter the **Catalog Account Identifier** (e.g. `ab12345.snowflakecomputing.com`). +2. Enter the **Catalog Name** (e.g. `snowflake_open_catalog`). +3. Enter the OAuth **Client ID** and **Client Secret**. +4. Enter the **Storage Endpoint** URI prefix for table storage. +5. Enter a **Database name** for the ClickHouse database that exposes your Polaris tables. +
+
+ + +Connect to Google Cloud BigLake Metastore (also known as the Lakehouse runtime catalog) for [Iceberg](/engines/table-engines/integrations/iceberg) tables in GCS. See the [BigLake Metastore guide](/use-cases/data-lake/biglake-catalog) for full setup. + +Before you connect, confirm: + +- ClickHouse version 26.2+. +- You have a BigLake Metastore instance with Iceberg tables in GCS. +- You have Google Application Default Credentials (ADC) with client ID, client secret, refresh token, and quota project ID. + +In the flyout: + +1. Enter your **Google ADC Client ID**, **Client Secret**, **Refresh Token**, and **Quota Project ID**. +2. Enter the **Cloud Storage Bucket** URI for table data (e.g. `gs://biglake-public-nyc-taxi-iceberg`). +3. Enter a **Database name** for the ClickHouse database that exposes your BigLake tables. +
+
+ +
+
+ +### Save the connection {#save-connection} + +After filling in the fields: + +1. Click **Add catalog**. ClickHouse validates the connection and credentials when saving. +2. On success, a confirmation toast appears with a **View in SQL console** link. Your catalog is listed in the **Linked catalogs** table with its connection status and table count. + +From the **Actions** menu on a linked catalog row, you can drop the catalog connection. Dropping removes the ClickHouse database binding — it does not delete data in your external catalog. + +
+ +## Query your data {#query-data} + + + +### View your catalog tables {#view-tables} + +On the **Data sources** page, find your catalog in the **Linked catalogs** table and click **View tables**. + +Linked catalogs table with View tables action + +ClickHouse opens the SQL console with your catalog database selected and lists the available tables. + +SQL console table browser showing catalog tables + +### Run a query {#run-a-query} + +Write a query in the SQL editor and click **Run**. Wrap the full table name in backticks: + +```sql +SELECT * FROM `identity_profiles.identity_profiles_iceberg` +``` + +SQL query with results using backticks for dotted table name + + + +## Troubleshooting {#troubleshooting} + +- If you don't see your tables in the SQL console: verify credentials, network access, and table types in the catalog. Make sure the tables you expect to see are in supported file and table formats. +- Open up a support ticket if you aren't able to debug. + +## See also {#see-also} + +- [Accelerating analytics on lakehouse data](/use-cases/data-lake/getting-started/accelerating-analytics) — load catalog tables into MergeTree for repeated queries +- [Accessing Iceberg data securely](/cloud/data-sources/secure-iceberg) — IAM role setup for AWS Iceberg and Glue access diff --git a/sidebars.js b/sidebars.js index 290a020fb6a..cbd77b44921 100644 --- a/sidebars.js +++ b/sidebars.js @@ -1024,6 +1024,7 @@ const sidebars = { 'integrations/data-sources/cassandra', 'integrations/data-ingestion/gcs/index', 'integrations/data-ingestion/s3-minio', + 'integrations/data-catalogs/index', 'integrations/data-ingestion/emqx/index', 'integrations/data-ingestion/insert-local-files', 'integrations/data-ingestion/dbms/jdbc-with-clickhouse', diff --git a/static/images/integrations/data-catalogs/catalog-flyout-select.png b/static/images/integrations/data-catalogs/catalog-flyout-select.png new file mode 100644 index 00000000000..801313e86f5 Binary files /dev/null and 
b/static/images/integrations/data-catalogs/catalog-flyout-select.png differ diff --git a/static/images/integrations/data-catalogs/catalog-sql-query.png b/static/images/integrations/data-catalogs/catalog-sql-query.png new file mode 100644 index 00000000000..8d02ad6c471 Binary files /dev/null and b/static/images/integrations/data-catalogs/catalog-sql-query.png differ diff --git a/static/images/integrations/data-catalogs/catalog-tables-browser.png b/static/images/integrations/data-catalogs/catalog-tables-browser.png new file mode 100644 index 00000000000..ce39bbc532c Binary files /dev/null and b/static/images/integrations/data-catalogs/catalog-tables-browser.png differ diff --git a/static/images/integrations/data-catalogs/linked-catalogs-table.png b/static/images/integrations/data-catalogs/linked-catalogs-table.png new file mode 100644 index 00000000000..d382ef925cc Binary files /dev/null and b/static/images/integrations/data-catalogs/linked-catalogs-table.png differ