From 86083cf8ffcff642459c4f0db0701b39074a793a Mon Sep 17 00:00:00 2001 From: Anant Vindal Date: Tue, 5 May 2026 12:49:26 +0530 Subject: [PATCH 01/18] fix: hottier downloads Make hottier downloads streaming --- src/hottier.rs | 7 ++-- src/metrics/mod.rs | 29 ++++++++++++++++ src/storage/azure_blob.rs | 61 +++++++++++++++++++++++++++++++-- src/storage/gcs.rs | 61 +++++++++++++++++++++++++++++++-- src/storage/localfs.rs | 13 +++++++ src/storage/object_storage.rs | 6 ++++ src/storage/s3.rs | 64 +++++++++++++++++++++++++++++++++-- 7 files changed, 229 insertions(+), 12 deletions(-) diff --git a/src/hottier.rs b/src/hottier.rs index c472f91c5..c6172d966 100644 --- a/src/hottier.rs +++ b/src/hottier.rs @@ -42,7 +42,6 @@ use relative_path::RelativePathBuf; use std::time::Duration; use sysinfo::Disks; use tokio::fs::{self, DirEntry}; -use tokio::io::AsyncWriteExt; use tokio_stream::wrappers::ReadDirStream; use tracing::{error, warn}; @@ -438,13 +437,11 @@ impl HotTierManager { } let parquet_file_path = RelativePathBuf::from(parquet_file.file_path.clone()); fs::create_dir_all(parquet_path.parent().unwrap()).await?; - let mut file = fs::File::create(parquet_path.clone()).await?; - let parquet_data = PARSEABLE + PARSEABLE .storage .get_object_store() - .get_object(&parquet_file_path, tenant_id) + .buffered_write(&parquet_file_path, tenant_id, parquet_path) .await?; - file.write_all(&parquet_data).await?; *parquet_file_size += parquet_file.file_size; stream_hot_tier.used_size = *parquet_file_size; diff --git a/src/metrics/mod.rs b/src/metrics/mod.rs index 9c9ff86fa..c7689863a 100644 --- a/src/metrics/mod.rs +++ b/src/metrics/mod.rs @@ -306,6 +306,19 @@ pub static TOTAL_FILES_SCANNED_IN_OBJECT_STORE_CALLS_BY_DATE: Lazy = + Lazy::new(|| { + IntCounterVec::new( + Opts::new( + "partial_file_scans_in_object_store_calls_by_date", + "Partial file scans in object store calls by date", + ) + .namespace(METRICS_NAMESPACE), + &["method", "date", "tenant_id"], + ) + .expect("metric can be 
created") + }); + pub static TOTAL_BYTES_SCANNED_IN_OBJECT_STORE_CALLS_BY_DATE: Lazy = Lazy::new(|| { IntCounterVec::new( @@ -527,6 +540,11 @@ fn custom_metrics(registry: &Registry) { TOTAL_FILES_SCANNED_IN_OBJECT_STORE_CALLS_BY_DATE.clone(), )) .expect("metric can be registered"); + registry + .register(Box::new( + PARTIAL_FILE_SCANS_IN_OBJECT_STORE_CALLS_BY_DATE.clone(), + )) + .expect("metric can be registered"); registry .register(Box::new( TOTAL_BYTES_SCANNED_IN_OBJECT_STORE_CALLS_BY_DATE.clone(), @@ -695,6 +713,17 @@ pub fn increment_object_store_calls_by_date(method: &str, date: &str, tenant_id: .inc(); } +pub fn increment_partial_file_scans_in_object_store_calls_by_date( + method: &str, + count: u64, + date: &str, + tenant_id: &str, +) { + PARTIAL_FILE_SCANS_IN_OBJECT_STORE_CALLS_BY_DATE + .with_label_values(&[method, date, tenant_id]) + .inc_by(count); +} + pub fn increment_files_scanned_in_object_store_calls_by_date( method: &str, count: u64, diff --git a/src/storage/azure_blob.rs b/src/storage/azure_blob.rs index 62abad0db..78271b29e 100644 --- a/src/storage/azure_blob.rs +++ b/src/storage/azure_blob.rs @@ -16,7 +16,12 @@ * */ -use std::{collections::HashSet, path::Path, sync::Arc, time::Duration}; +use std::{ + collections::HashSet, + path::{Path, PathBuf}, + sync::Arc, + time::Duration, +}; use async_trait::async_trait; use bytes::Bytes; @@ -38,7 +43,10 @@ use object_store::{ path::Path as StorePath, }; use relative_path::{RelativePath, RelativePathBuf}; -use tokio::{fs::OpenOptions, io::AsyncReadExt}; +use tokio::{ + fs::OpenOptions, + io::{AsyncReadExt, AsyncWriteExt, BufWriter}, +}; use tracing::error; use url::Url; @@ -47,6 +55,7 @@ use crate::{ increment_bytes_scanned_in_object_store_calls_by_date, increment_files_scanned_in_object_store_calls_by_date, increment_object_store_calls_by_date, + increment_partial_file_scans_in_object_store_calls_by_date, }, parseable::{DEFAULT_TENANT, LogStream, PARSEABLE}, }; @@ -204,6 +213,46 @@ pub struct BlobStore 
{ } impl BlobStore { + async fn _stream_to_file( + &self, + path: &RelativePath, + tenant_id: &Option, + write_path: PathBuf, + ) -> Result<(), ObjectStorageError> { + let tenant_str = tenant_id.as_deref().unwrap_or(DEFAULT_TENANT); + let date = Utc::now().date_naive().to_string(); + let resp = self.client.get(&to_object_store_path(path)).await; + increment_object_store_calls_by_date("GET", &date, tenant_str); + let resp = resp?; + + if let Some(parent) = write_path.parent() { + tokio::fs::create_dir_all(parent).await?; + } + let file = tokio::fs::File::create(&write_path).await?; + let mut writer = BufWriter::new(file); + let mut stream = resp.into_stream(); + let mut total: u64 = 0; + let mut chunk_count: u64 = 0; + while let Some(chunk) = stream.next().await { + let chunk = chunk?; + total += chunk.len() as u64; + chunk_count += 1; + writer.write_all(&chunk).await?; + } + writer.flush().await?; + writer.into_inner().sync_all().await?; + + increment_files_scanned_in_object_store_calls_by_date("GET", 1, &date, tenant_str); + increment_bytes_scanned_in_object_store_calls_by_date("GET", total, &date, tenant_str); + increment_partial_file_scans_in_object_store_calls_by_date( + "GET", + chunk_count, + &date, + tenant_str, + ); + Ok(()) + } + async fn _get_object( &self, path: &RelativePath, @@ -466,6 +515,14 @@ impl BlobStore { #[async_trait] impl ObjectStorage for BlobStore { + async fn buffered_write( + &self, + path: &RelativePath, + tenant_id: &Option, + write_path: PathBuf, + ) -> Result<(), ObjectStorageError> { + self._stream_to_file(path, tenant_id, write_path).await + } async fn get_buffered_reader( &self, _path: &RelativePath, diff --git a/src/storage/gcs.rs b/src/storage/gcs.rs index 019802f24..314649116 100644 --- a/src/storage/gcs.rs +++ b/src/storage/gcs.rs @@ -16,13 +16,19 @@ * */ -use std::{collections::HashSet, path::Path, sync::Arc, time::Duration}; +use std::{ + collections::HashSet, + path::{Path, PathBuf}, + sync::Arc, + time::Duration, +}; use 
crate::{ metrics::{ increment_bytes_scanned_in_object_store_calls_by_date, increment_files_scanned_in_object_store_calls_by_date, increment_object_store_calls_by_date, + increment_partial_file_scans_in_object_store_calls_by_date, }, parseable::{DEFAULT_TENANT, LogStream, PARSEABLE}, }; @@ -46,7 +52,10 @@ use object_store::{ path::Path as StorePath, }; use relative_path::{RelativePath, RelativePathBuf}; -use tokio::{fs::OpenOptions, io::AsyncReadExt}; +use tokio::{ + fs::OpenOptions, + io::{AsyncReadExt, AsyncWriteExt, BufWriter}, +}; use tracing::error; use super::{ @@ -169,6 +178,46 @@ pub struct Gcs { } impl Gcs { + async fn _stream_to_file( + &self, + path: &RelativePath, + tenant_id: &Option, + write_path: PathBuf, + ) -> Result<(), ObjectStorageError> { + let tenant_str = tenant_id.as_deref().unwrap_or(DEFAULT_TENANT); + let date = Utc::now().date_naive().to_string(); + let resp = self.client.get(&to_object_store_path(path)).await; + increment_object_store_calls_by_date("GET", &date, tenant_str); + let resp = resp?; + + if let Some(parent) = write_path.parent() { + tokio::fs::create_dir_all(parent).await?; + } + let file = tokio::fs::File::create(&write_path).await?; + let mut writer = BufWriter::new(file); + let mut stream = resp.into_stream(); + let mut total: u64 = 0; + let mut chunk_count: u64 = 0; + while let Some(chunk) = stream.next().await { + let chunk = chunk?; + total += chunk.len() as u64; + chunk_count += 1; + writer.write_all(&chunk).await?; + } + writer.flush().await?; + writer.into_inner().sync_all().await?; + + increment_files_scanned_in_object_store_calls_by_date("GET", 1, &date, tenant_str); + increment_bytes_scanned_in_object_store_calls_by_date("GET", total, &date, tenant_str); + increment_partial_file_scans_in_object_store_calls_by_date( + "GET", + chunk_count, + &date, + tenant_str, + ); + Ok(()) + } + async fn _get_object( &self, path: &RelativePath, @@ -431,6 +480,14 @@ impl Gcs { #[async_trait] impl ObjectStorage for Gcs { + async fn 
buffered_write( + &self, + path: &RelativePath, + tenant_id: &Option, + write_path: PathBuf, + ) -> Result<(), ObjectStorageError> { + self._stream_to_file(path, tenant_id, write_path).await + } async fn get_buffered_reader( &self, path: &RelativePath, diff --git a/src/storage/localfs.rs b/src/storage/localfs.rs index 6f981981c..c9a58502c 100644 --- a/src/storage/localfs.rs +++ b/src/storage/localfs.rs @@ -106,6 +106,19 @@ impl LocalFS { #[async_trait] impl ObjectStorage for LocalFS { + async fn buffered_write( + &self, + path: &RelativePath, + _tenant_id: &Option, + write_path: PathBuf, + ) -> Result<(), ObjectStorageError> { + let src = self.path_in_root(path); + if let Some(parent) = write_path.parent() { + fs::create_dir_all(parent).await?; + } + fs::copy(&src, &write_path).await?; + Ok(()) + } async fn upload_multipart( &self, key: &RelativePath, diff --git a/src/storage/object_storage.rs b/src/storage/object_storage.rs index 2e58a0df5..2c6244733 100644 --- a/src/storage/object_storage.rs +++ b/src/storage/object_storage.rs @@ -302,6 +302,12 @@ pub trait ObjectStorage: Debug + Send + Sync + 'static { path: &RelativePath, tenant_id: &Option, ) -> Result; + async fn buffered_write( + &self, + path: &RelativePath, + tenant_id: &Option, + write_path: PathBuf, + ) -> Result<(), ObjectStorageError>; async fn get_object( &self, path: &RelativePath, diff --git a/src/storage/s3.rs b/src/storage/s3.rs index 169d3c508..ba38b532f 100644 --- a/src/storage/s3.rs +++ b/src/storage/s3.rs @@ -17,7 +17,12 @@ */ use std::{ - collections::HashSet, fmt::Display, path::Path, str::FromStr, sync::Arc, time::Duration, + collections::HashSet, + fmt::Display, + path::{Path, PathBuf}, + str::FromStr, + sync::Arc, + time::Duration, }; use async_trait::async_trait; @@ -40,7 +45,10 @@ use object_store::{ path::Path as StorePath, }; use relative_path::{RelativePath, RelativePathBuf}; -use tokio::{fs::OpenOptions, io::AsyncReadExt}; +use tokio::{ + fs::OpenOptions, + io::{AsyncReadExt, 
AsyncWriteExt, BufWriter}, +}; use tracing::error; use crate::{ @@ -48,6 +56,7 @@ use crate::{ increment_bytes_scanned_in_object_store_calls_by_date, increment_files_scanned_in_object_store_calls_by_date, increment_object_store_calls_by_date, + increment_partial_file_scans_in_object_store_calls_by_date, }, parseable::{DEFAULT_TENANT, LogStream, PARSEABLE}, }; @@ -331,6 +340,46 @@ pub struct S3 { } impl S3 { + async fn _stream_to_file( + &self, + path: &RelativePath, + tenant_id: &Option, + write_path: PathBuf, + ) -> Result<(), ObjectStorageError> { + let tenant_str = tenant_id.as_deref().unwrap_or(DEFAULT_TENANT); + let date = Utc::now().date_naive().to_string(); + let resp = self.client.get(&to_object_store_path(path)).await; + increment_object_store_calls_by_date("GET", &date, tenant_str); + let resp = resp?; + + if let Some(parent) = write_path.parent() { + tokio::fs::create_dir_all(parent).await?; + } + let file = tokio::fs::File::create(&write_path).await?; + let mut writer = BufWriter::new(file); + let mut stream = resp.into_stream(); + let mut total: u64 = 0; + let mut chunk_count: u64 = 0; + while let Some(chunk) = stream.next().await { + let chunk = chunk?; + total += chunk.len() as u64; + chunk_count += 1; + writer.write_all(&chunk).await?; + } + writer.flush().await?; + writer.into_inner().sync_all().await?; + + increment_files_scanned_in_object_store_calls_by_date("GET", 1, &date, tenant_str); + increment_bytes_scanned_in_object_store_calls_by_date("GET", total, &date, tenant_str); + increment_partial_file_scans_in_object_store_calls_by_date( + "GET", + chunk_count, + &date, + tenant_str, + ); + Ok(()) + } + async fn _get_object( &self, path: &RelativePath, @@ -686,12 +735,21 @@ impl ObjectStorage for S3 { Ok(result?) 
} + async fn buffered_write( + &self, + path: &RelativePath, + tenant_id: &Option, + write_path: PathBuf, + ) -> Result<(), ObjectStorageError> { + self._stream_to_file(path, tenant_id, write_path).await + } + async fn get_object( &self, path: &RelativePath, tenant_id: &Option, ) -> Result { - Ok(self._get_object(path, tenant_id).await?) + self._get_object(path, tenant_id).await } async fn get_objects( From b26b90bd235036c2c00c13254c899b74749fc576 Mon Sep 17 00:00:00 2001 From: Anant Vindal Date: Tue, 5 May 2026 14:36:54 +0530 Subject: [PATCH 02/18] replace streaming with parallel downloads --- src/storage/azure_blob.rs | 63 +++++++++++++++++++++++++++----------- src/storage/gcs.rs | 64 ++++++++++++++++++++++++++++----------- src/storage/s3.rs | 63 +++++++++++++++++++++++++++----------- 3 files changed, 136 insertions(+), 54 deletions(-) diff --git a/src/storage/azure_blob.rs b/src/storage/azure_blob.rs index 78271b29e..2cb96f974 100644 --- a/src/storage/azure_blob.rs +++ b/src/storage/azure_blob.rs @@ -18,6 +18,7 @@ use std::{ collections::HashSet, + ops::Range, path::{Path, PathBuf}, sync::Arc, time::Duration, @@ -45,11 +46,15 @@ use object_store::{ use relative_path::{RelativePath, RelativePathBuf}; use tokio::{ fs::OpenOptions, - io::{AsyncReadExt, AsyncWriteExt, BufWriter}, + io::{AsyncReadExt, AsyncSeekExt, AsyncWriteExt}, + sync::Mutex as AsyncMutex, }; use tracing::error; use url::Url; +const PARALLEL_DOWNLOAD_CHUNK_SIZE: u64 = 8 * 1024 * 1024; +const PARALLEL_DOWNLOAD_CONCURRENCY: usize = 8; + use crate::{ metrics::{ increment_bytes_scanned_in_object_store_calls_by_date, @@ -213,7 +218,7 @@ pub struct BlobStore { } impl BlobStore { - async fn _stream_to_file( + async fn _parallel_download( &self, path: &RelativePath, tenant_id: &Option, @@ -221,27 +226,49 @@ impl BlobStore { ) -> Result<(), ObjectStorageError> { let tenant_str = tenant_id.as_deref().unwrap_or(DEFAULT_TENANT); let date = Utc::now().date_naive().to_string(); - let resp = 
self.client.get(&to_object_store_path(path)).await; - increment_object_store_calls_by_date("GET", &date, tenant_str); - let resp = resp?; + let src = to_object_store_path(path); + + let meta = self.client.head(&src).await?; + increment_object_store_calls_by_date("HEAD", &date, tenant_str); + let total = meta.size; if let Some(parent) = write_path.parent() { tokio::fs::create_dir_all(parent).await?; } let file = tokio::fs::File::create(&write_path).await?; - let mut writer = BufWriter::new(file); - let mut stream = resp.into_stream(); - let mut total: u64 = 0; - let mut chunk_count: u64 = 0; - while let Some(chunk) = stream.next().await { - let chunk = chunk?; - total += chunk.len() as u64; - chunk_count += 1; - writer.write_all(&chunk).await?; - } - writer.flush().await?; - writer.into_inner().sync_all().await?; + file.set_len(total).await?; + let file = Arc::new(AsyncMutex::new(file)); + let chunk = PARALLEL_DOWNLOAD_CHUNK_SIZE; + let ranges: Vec> = (0..total) + .step_by(chunk as usize) + .map(|s| s..(s + chunk).min(total)) + .collect(); + let chunk_count = ranges.len() as u64; + + futures::stream::iter(ranges) + .map(|r| { + let src = src.clone(); + let file = file.clone(); + let client = &self.client; + async move { + let bytes = client.get_range(&src, r.clone()).await?; + let mut f = file.lock().await; + f.seek(std::io::SeekFrom::Start(r.start)).await?; + f.write_all(&bytes).await?; + Ok::<_, ObjectStorageError>(()) + } + }) + .buffer_unordered(PARALLEL_DOWNLOAD_CONCURRENCY) + .try_collect::>() + .await?; + + let file = Arc::try_unwrap(file) + .map_err(|_| ObjectStorageError::Custom("download file arc still shared".into()))? 
+ .into_inner(); + file.sync_all().await?; + + increment_object_store_calls_by_date("GET", &date, tenant_str); increment_files_scanned_in_object_store_calls_by_date("GET", 1, &date, tenant_str); increment_bytes_scanned_in_object_store_calls_by_date("GET", total, &date, tenant_str); increment_partial_file_scans_in_object_store_calls_by_date( @@ -521,7 +548,7 @@ impl ObjectStorage for BlobStore { tenant_id: &Option, write_path: PathBuf, ) -> Result<(), ObjectStorageError> { - self._stream_to_file(path, tenant_id, write_path).await + self._parallel_download(path, tenant_id, write_path).await } async fn get_buffered_reader( &self, diff --git a/src/storage/gcs.rs b/src/storage/gcs.rs index 314649116..f3ccfbd07 100644 --- a/src/storage/gcs.rs +++ b/src/storage/gcs.rs @@ -18,6 +18,7 @@ use std::{ collections::HashSet, + ops::Range, path::{Path, PathBuf}, sync::Arc, time::Duration, @@ -54,8 +55,12 @@ use object_store::{ use relative_path::{RelativePath, RelativePathBuf}; use tokio::{ fs::OpenOptions, - io::{AsyncReadExt, AsyncWriteExt, BufWriter}, + io::{AsyncReadExt, AsyncSeekExt, AsyncWriteExt}, + sync::Mutex as AsyncMutex, }; + +const PARALLEL_DOWNLOAD_CHUNK_SIZE: u64 = 8 * 1024 * 1024; +const PARALLEL_DOWNLOAD_CONCURRENCY: usize = 8; use tracing::error; use super::{ @@ -178,7 +183,7 @@ pub struct Gcs { } impl Gcs { - async fn _stream_to_file( + async fn _parallel_download( &self, path: &RelativePath, tenant_id: &Option, @@ -186,27 +191,50 @@ impl Gcs { ) -> Result<(), ObjectStorageError> { let tenant_str = tenant_id.as_deref().unwrap_or(DEFAULT_TENANT); let date = Utc::now().date_naive().to_string(); - let resp = self.client.get(&to_object_store_path(path)).await; - increment_object_store_calls_by_date("GET", &date, tenant_str); - let resp = resp?; + let src = to_object_store_path(path); + + let meta = self.client.head(&src).await?; + increment_object_store_calls_by_date("HEAD", &date, tenant_str); + let total = meta.size; if let Some(parent) = write_path.parent() { 
tokio::fs::create_dir_all(parent).await?; } let file = tokio::fs::File::create(&write_path).await?; - let mut writer = BufWriter::new(file); - let mut stream = resp.into_stream(); - let mut total: u64 = 0; - let mut chunk_count: u64 = 0; - while let Some(chunk) = stream.next().await { - let chunk = chunk?; - total += chunk.len() as u64; - chunk_count += 1; - writer.write_all(&chunk).await?; - } - writer.flush().await?; - writer.into_inner().sync_all().await?; + file.set_len(total).await?; + let file = Arc::new(AsyncMutex::new(file)); + let chunk = PARALLEL_DOWNLOAD_CHUNK_SIZE; + let ranges: Vec> = (0..total) + .step_by(chunk as usize) + .map(|s| s..(s + chunk).min(total)) + .collect(); + let chunk_count = ranges.len() as u64; + let client = self.client.clone(); + + futures::stream::iter(ranges) + .map(|r| { + let client = client.clone(); + let src = src.clone(); + let file = file.clone(); + async move { + let bytes = client.get_range(&src, r.clone()).await?; + let mut f = file.lock().await; + f.seek(std::io::SeekFrom::Start(r.start)).await?; + f.write_all(&bytes).await?; + Ok::<_, ObjectStorageError>(()) + } + }) + .buffer_unordered(PARALLEL_DOWNLOAD_CONCURRENCY) + .try_collect::>() + .await?; + + let file = Arc::try_unwrap(file) + .map_err(|_| ObjectStorageError::Custom("download file arc still shared".into()))? 
+ .into_inner(); + file.sync_all().await?; + + increment_object_store_calls_by_date("GET", &date, tenant_str); increment_files_scanned_in_object_store_calls_by_date("GET", 1, &date, tenant_str); increment_bytes_scanned_in_object_store_calls_by_date("GET", total, &date, tenant_str); increment_partial_file_scans_in_object_store_calls_by_date( @@ -486,7 +514,7 @@ impl ObjectStorage for Gcs { tenant_id: &Option, write_path: PathBuf, ) -> Result<(), ObjectStorageError> { - self._stream_to_file(path, tenant_id, write_path).await + self._parallel_download(path, tenant_id, write_path).await } async fn get_buffered_reader( &self, diff --git a/src/storage/s3.rs b/src/storage/s3.rs index ba38b532f..c89a81191 100644 --- a/src/storage/s3.rs +++ b/src/storage/s3.rs @@ -19,6 +19,7 @@ use std::{ collections::HashSet, fmt::Display, + ops::Range, path::{Path, PathBuf}, str::FromStr, sync::Arc, @@ -47,7 +48,8 @@ use object_store::{ use relative_path::{RelativePath, RelativePathBuf}; use tokio::{ fs::OpenOptions, - io::{AsyncReadExt, AsyncWriteExt, BufWriter}, + io::{AsyncReadExt, AsyncSeekExt, AsyncWriteExt}, + sync::Mutex as AsyncMutex, }; use tracing::error; @@ -70,6 +72,8 @@ use super::{ // in bytes // const MULTIPART_UPLOAD_SIZE: usize = 1024 * 1024 * 100; const AWS_CONTAINER_CREDENTIALS_RELATIVE_URI: &str = "AWS_CONTAINER_CREDENTIALS_RELATIVE_URI"; +const PARALLEL_DOWNLOAD_CHUNK_SIZE: u64 = 8 * 1024 * 1024; +const PARALLEL_DOWNLOAD_CONCURRENCY: usize = 8; #[derive(Debug, Clone, clap::Args)] #[command( @@ -340,7 +344,7 @@ pub struct S3 { } impl S3 { - async fn _stream_to_file( + async fn _parallel_download( &self, path: &RelativePath, tenant_id: &Option, @@ -348,27 +352,50 @@ impl S3 { ) -> Result<(), ObjectStorageError> { let tenant_str = tenant_id.as_deref().unwrap_or(DEFAULT_TENANT); let date = Utc::now().date_naive().to_string(); - let resp = self.client.get(&to_object_store_path(path)).await; - increment_object_store_calls_by_date("GET", &date, tenant_str); - let resp = 
resp?; + let src = to_object_store_path(path); + + let meta = self.client.head(&src).await?; + increment_object_store_calls_by_date("HEAD", &date, tenant_str); + let total = meta.size; if let Some(parent) = write_path.parent() { tokio::fs::create_dir_all(parent).await?; } let file = tokio::fs::File::create(&write_path).await?; - let mut writer = BufWriter::new(file); - let mut stream = resp.into_stream(); - let mut total: u64 = 0; - let mut chunk_count: u64 = 0; - while let Some(chunk) = stream.next().await { - let chunk = chunk?; - total += chunk.len() as u64; - chunk_count += 1; - writer.write_all(&chunk).await?; - } - writer.flush().await?; - writer.into_inner().sync_all().await?; + file.set_len(total).await?; + let file = Arc::new(AsyncMutex::new(file)); + let chunk = PARALLEL_DOWNLOAD_CHUNK_SIZE; + let ranges: Vec> = (0..total) + .step_by(chunk as usize) + .map(|s| s..(s + chunk).min(total)) + .collect(); + let chunk_count = ranges.len() as u64; + let client = Arc::new(self.client.clone()); + + futures::stream::iter(ranges) + .map(|r| { + let client = client.clone(); + let src = src.clone(); + let file = file.clone(); + async move { + let bytes = client.get_range(&src, r.clone()).await?; + let mut f = file.lock().await; + f.seek(std::io::SeekFrom::Start(r.start)).await?; + f.write_all(&bytes).await?; + Ok::<_, ObjectStorageError>(()) + } + }) + .buffer_unordered(PARALLEL_DOWNLOAD_CONCURRENCY) + .try_collect::>() + .await?; + + let file = Arc::try_unwrap(file) + .map_err(|_| ObjectStorageError::Custom("download file arc still shared".into()))? 
+ .into_inner(); + file.sync_all().await?; + + increment_object_store_calls_by_date("GET", &date, tenant_str); increment_files_scanned_in_object_store_calls_by_date("GET", 1, &date, tenant_str); increment_bytes_scanned_in_object_store_calls_by_date("GET", total, &date, tenant_str); increment_partial_file_scans_in_object_store_calls_by_date( @@ -741,7 +768,7 @@ impl ObjectStorage for S3 { tenant_id: &Option, write_path: PathBuf, ) -> Result<(), ObjectStorageError> { - self._stream_to_file(path, tenant_id, write_path).await + self._parallel_download(path, tenant_id, write_path).await } async fn get_object( From 92866c05d53d168d161b84c87eda05116f49129a Mon Sep 17 00:00:00 2001 From: Anant Vindal Date: Tue, 5 May 2026 15:25:04 +0530 Subject: [PATCH 03/18] new task per range --- src/storage/azure_blob.rs | 48 +++++++++++++++++++++++---------------- src/storage/gcs.rs | 43 ++++++++++++++++++++--------------- src/storage/s3.rs | 43 ++++++++++++++++++++--------------- 3 files changed, 78 insertions(+), 56 deletions(-) diff --git a/src/storage/azure_blob.rs b/src/storage/azure_blob.rs index 2cb96f974..9bc67fcd0 100644 --- a/src/storage/azure_blob.rs +++ b/src/storage/azure_blob.rs @@ -53,7 +53,7 @@ use tracing::error; use url::Url; const PARALLEL_DOWNLOAD_CHUNK_SIZE: u64 = 8 * 1024 * 1024; -const PARALLEL_DOWNLOAD_CONCURRENCY: usize = 8; +const PARALLEL_DOWNLOAD_CONCURRENCY: usize = 16; use crate::{ metrics::{ @@ -195,7 +195,7 @@ impl ObjectStorageProvider for AzureBlobConfig { // limit objectstore to a concurrent request limit let azure = LimitStore::new(azure, super::MAX_OBJECT_STORE_REQUESTS); Arc::new(BlobStore { - client: azure, + client: Arc::new(azure), account: self.account.clone(), container: self.container.clone(), root: StorePath::from(""), @@ -211,7 +211,7 @@ impl ObjectStorageProvider for AzureBlobConfig { // object store such as S3 and Azure Blob #[derive(Debug)] pub struct BlobStore { - client: LimitStore, + client: Arc>, account: String, container: String, 
root: StorePath, @@ -245,23 +245,31 @@ impl BlobStore { .map(|s| s..(s + chunk).min(total)) .collect(); let chunk_count = ranges.len() as u64; - - futures::stream::iter(ranges) - .map(|r| { - let src = src.clone(); - let file = file.clone(); - let client = &self.client; - async move { - let bytes = client.get_range(&src, r.clone()).await?; - let mut f = file.lock().await; - f.seek(std::io::SeekFrom::Start(r.start)).await?; - f.write_all(&bytes).await?; - Ok::<_, ObjectStorageError>(()) - } - }) - .buffer_unordered(PARALLEL_DOWNLOAD_CONCURRENCY) - .try_collect::>() - .await?; + let client = self.client.clone(); + let semaphore = Arc::new(tokio::sync::Semaphore::new(PARALLEL_DOWNLOAD_CONCURRENCY)); + + let mut handles = Vec::with_capacity(ranges.len()); + for r in ranges { + let client = client.clone(); + let src = src.clone(); + let file = file.clone(); + let semaphore = semaphore.clone(); + handles.push(tokio::spawn(async move { + let _permit = semaphore + .acquire_owned() + .await + .map_err(|e| ObjectStorageError::Custom(format!("semaphore closed: {e}")))?; + let bytes = client.get_range(&src, r.clone()).await?; + let mut f = file.lock().await; + f.seek(std::io::SeekFrom::Start(r.start)).await?; + f.write_all(&bytes).await?; + Ok::<_, ObjectStorageError>(()) + })); + } + for h in handles { + h.await + .map_err(|e| ObjectStorageError::Custom(format!("join error: {e}")))??; + } let file = Arc::try_unwrap(file) .map_err(|_| ObjectStorageError::Custom("download file arc still shared".into()))? 
diff --git a/src/storage/gcs.rs b/src/storage/gcs.rs index f3ccfbd07..32b741737 100644 --- a/src/storage/gcs.rs +++ b/src/storage/gcs.rs @@ -60,7 +60,7 @@ use tokio::{ }; const PARALLEL_DOWNLOAD_CHUNK_SIZE: u64 = 8 * 1024 * 1024; -const PARALLEL_DOWNLOAD_CONCURRENCY: usize = 8; +const PARALLEL_DOWNLOAD_CONCURRENCY: usize = 16; use tracing::error; use super::{ @@ -211,23 +211,30 @@ impl Gcs { .collect(); let chunk_count = ranges.len() as u64; let client = self.client.clone(); - - futures::stream::iter(ranges) - .map(|r| { - let client = client.clone(); - let src = src.clone(); - let file = file.clone(); - async move { - let bytes = client.get_range(&src, r.clone()).await?; - let mut f = file.lock().await; - f.seek(std::io::SeekFrom::Start(r.start)).await?; - f.write_all(&bytes).await?; - Ok::<_, ObjectStorageError>(()) - } - }) - .buffer_unordered(PARALLEL_DOWNLOAD_CONCURRENCY) - .try_collect::>() - .await?; + let semaphore = Arc::new(tokio::sync::Semaphore::new(PARALLEL_DOWNLOAD_CONCURRENCY)); + + let mut handles = Vec::with_capacity(ranges.len()); + for r in ranges { + let client = client.clone(); + let src = src.clone(); + let file = file.clone(); + let semaphore = semaphore.clone(); + handles.push(tokio::spawn(async move { + let _permit = semaphore + .acquire_owned() + .await + .map_err(|e| ObjectStorageError::Custom(format!("semaphore closed: {e}")))?; + let bytes = client.get_range(&src, r.clone()).await?; + let mut f = file.lock().await; + f.seek(std::io::SeekFrom::Start(r.start)).await?; + f.write_all(&bytes).await?; + Ok::<_, ObjectStorageError>(()) + })); + } + for h in handles { + h.await + .map_err(|e| ObjectStorageError::Custom(format!("join error: {e}")))??; + } let file = Arc::try_unwrap(file) .map_err(|_| ObjectStorageError::Custom("download file arc still shared".into()))? 
diff --git a/src/storage/s3.rs b/src/storage/s3.rs index c89a81191..11abd3cd5 100644 --- a/src/storage/s3.rs +++ b/src/storage/s3.rs @@ -73,7 +73,7 @@ use super::{ // const MULTIPART_UPLOAD_SIZE: usize = 1024 * 1024 * 100; const AWS_CONTAINER_CREDENTIALS_RELATIVE_URI: &str = "AWS_CONTAINER_CREDENTIALS_RELATIVE_URI"; const PARALLEL_DOWNLOAD_CHUNK_SIZE: u64 = 8 * 1024 * 1024; -const PARALLEL_DOWNLOAD_CONCURRENCY: usize = 8; +const PARALLEL_DOWNLOAD_CONCURRENCY: usize = 16; #[derive(Debug, Clone, clap::Args)] #[command( @@ -372,23 +372,30 @@ impl S3 { .collect(); let chunk_count = ranges.len() as u64; let client = Arc::new(self.client.clone()); - - futures::stream::iter(ranges) - .map(|r| { - let client = client.clone(); - let src = src.clone(); - let file = file.clone(); - async move { - let bytes = client.get_range(&src, r.clone()).await?; - let mut f = file.lock().await; - f.seek(std::io::SeekFrom::Start(r.start)).await?; - f.write_all(&bytes).await?; - Ok::<_, ObjectStorageError>(()) - } - }) - .buffer_unordered(PARALLEL_DOWNLOAD_CONCURRENCY) - .try_collect::>() - .await?; + let semaphore = Arc::new(tokio::sync::Semaphore::new(PARALLEL_DOWNLOAD_CONCURRENCY)); + + let mut handles = Vec::with_capacity(ranges.len()); + for r in ranges { + let client = client.clone(); + let src = src.clone(); + let file = file.clone(); + let semaphore = semaphore.clone(); + handles.push(tokio::spawn(async move { + let _permit = semaphore + .acquire_owned() + .await + .map_err(|e| ObjectStorageError::Custom(format!("semaphore closed: {e}")))?; + let bytes = client.get_range(&src, r.clone()).await?; + let mut f = file.lock().await; + f.seek(std::io::SeekFrom::Start(r.start)).await?; + f.write_all(&bytes).await?; + Ok::<_, ObjectStorageError>(()) + })); + } + for h in handles { + h.await + .map_err(|e| ObjectStorageError::Custom(format!("join error: {e}")))??; + } let file = Arc::try_unwrap(file) .map_err(|_| ObjectStorageError::Custom("download file arc still shared".into()))? 
From 881d6fe4baaba3b1cd092000fea5b9fde6729182 Mon Sep 17 00:00:00 2001 From: Anant Vindal Date: Tue, 5 May 2026 17:11:31 +0530 Subject: [PATCH 04/18] expose new env vars --- src/cli.rs | 16 ++++++++++++++++ src/storage/azure_blob.rs | 11 ++++++----- src/storage/gcs.rs | 10 ++++++---- src/storage/s3.rs | 10 ++++++---- 4 files changed, 34 insertions(+), 13 deletions(-) diff --git a/src/cli.rs b/src/cli.rs index 969c3a619..49aea0be5 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -314,6 +314,22 @@ pub struct Options { )] pub hot_tier_storage_path: Option, + #[arg( + long = "hot-tier-download-chunk-size", + env = "P_HOT_TIER_DOWNLOAD_CHUNK_SIZE", + default_value = "8388608", + help = "Chunk size in bytes for parallel hot tier downloads (default 8 MiB)" + )] + pub hot_tier_download_chunk_size: u64, + + #[arg( + long = "hot-tier-download-concurrency", + env = "P_HOT_TIER_DOWNLOAD_CONCURRENCY", + default_value = "16", + help = "Number of concurrent range requests per hot tier download" + )] + pub hot_tier_download_concurrency: usize, + //TODO: remove this when smart cache is implemented #[arg( long = "index-storage-path", diff --git a/src/storage/azure_blob.rs b/src/storage/azure_blob.rs index 9bc67fcd0..46d610573 100644 --- a/src/storage/azure_blob.rs +++ b/src/storage/azure_blob.rs @@ -52,9 +52,6 @@ use tokio::{ use tracing::error; use url::Url; -const PARALLEL_DOWNLOAD_CHUNK_SIZE: u64 = 8 * 1024 * 1024; -const PARALLEL_DOWNLOAD_CONCURRENCY: usize = 16; - use crate::{ metrics::{ increment_bytes_scanned_in_object_store_calls_by_date, @@ -239,14 +236,18 @@ impl BlobStore { file.set_len(total).await?; let file = Arc::new(AsyncMutex::new(file)); - let chunk = PARALLEL_DOWNLOAD_CHUNK_SIZE; + let chunk = PARSEABLE + .options + .hot_tier_download_chunk_size + .max(8 * 1024 * 1024); + let concurrency = PARSEABLE.options.hot_tier_download_concurrency.max(6); let ranges: Vec> = (0..total) .step_by(chunk as usize) .map(|s| s..(s + chunk).min(total)) .collect(); let chunk_count = 
ranges.len() as u64; let client = self.client.clone(); - let semaphore = Arc::new(tokio::sync::Semaphore::new(PARALLEL_DOWNLOAD_CONCURRENCY)); + let semaphore = Arc::new(tokio::sync::Semaphore::new(concurrency)); let mut handles = Vec::with_capacity(ranges.len()); for r in ranges { diff --git a/src/storage/gcs.rs b/src/storage/gcs.rs index 32b741737..561c687f4 100644 --- a/src/storage/gcs.rs +++ b/src/storage/gcs.rs @@ -59,8 +59,6 @@ use tokio::{ sync::Mutex as AsyncMutex, }; -const PARALLEL_DOWNLOAD_CHUNK_SIZE: u64 = 8 * 1024 * 1024; -const PARALLEL_DOWNLOAD_CONCURRENCY: usize = 16; use tracing::error; use super::{ @@ -204,14 +202,18 @@ impl Gcs { file.set_len(total).await?; let file = Arc::new(AsyncMutex::new(file)); - let chunk = PARALLEL_DOWNLOAD_CHUNK_SIZE; + let chunk = PARSEABLE + .options + .hot_tier_download_chunk_size + .max(8 * 1024 * 1024); + let concurrency = PARSEABLE.options.hot_tier_download_concurrency.max(16); let ranges: Vec> = (0..total) .step_by(chunk as usize) .map(|s| s..(s + chunk).min(total)) .collect(); let chunk_count = ranges.len() as u64; let client = self.client.clone(); - let semaphore = Arc::new(tokio::sync::Semaphore::new(PARALLEL_DOWNLOAD_CONCURRENCY)); + let semaphore = Arc::new(tokio::sync::Semaphore::new(concurrency)); let mut handles = Vec::with_capacity(ranges.len()); for r in ranges { diff --git a/src/storage/s3.rs b/src/storage/s3.rs index 11abd3cd5..f1bf5a3ba 100644 --- a/src/storage/s3.rs +++ b/src/storage/s3.rs @@ -72,8 +72,6 @@ use super::{ // in bytes // const MULTIPART_UPLOAD_SIZE: usize = 1024 * 1024 * 100; const AWS_CONTAINER_CREDENTIALS_RELATIVE_URI: &str = "AWS_CONTAINER_CREDENTIALS_RELATIVE_URI"; -const PARALLEL_DOWNLOAD_CHUNK_SIZE: u64 = 8 * 1024 * 1024; -const PARALLEL_DOWNLOAD_CONCURRENCY: usize = 16; #[derive(Debug, Clone, clap::Args)] #[command( @@ -365,14 +363,18 @@ impl S3 { file.set_len(total).await?; let file = Arc::new(AsyncMutex::new(file)); - let chunk = PARALLEL_DOWNLOAD_CHUNK_SIZE; + let chunk = 
PARSEABLE + .options + .hot_tier_download_chunk_size + .max(8 * 1024 * 1024); + let concurrency = PARSEABLE.options.hot_tier_download_concurrency.max(16); let ranges: Vec> = (0..total) .step_by(chunk as usize) .map(|s| s..(s + chunk).min(total)) .collect(); let chunk_count = ranges.len() as u64; let client = Arc::new(self.client.clone()); - let semaphore = Arc::new(tokio::sync::Semaphore::new(PARALLEL_DOWNLOAD_CONCURRENCY)); + let semaphore = Arc::new(tokio::sync::Semaphore::new(concurrency)); let mut handles = Vec::with_capacity(ranges.len()); for r in ranges { From aec06d3e7f6d8ba9215a952ac71ffc032917011d Mon Sep 17 00:00:00 2001 From: Anant Vindal Date: Tue, 5 May 2026 17:35:54 +0530 Subject: [PATCH 05/18] parallel file download per stream --- src/cli.rs | 8 ++ src/hottier.rs | 268 ++++++++++++++++++++++++++++++++----------------- 2 files changed, 186 insertions(+), 90 deletions(-) diff --git a/src/cli.rs b/src/cli.rs index 49aea0be5..1eaf617c7 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -330,6 +330,14 @@ pub struct Options { )] pub hot_tier_download_concurrency: usize, + #[arg( + long = "hot-tier-files-per-stream-concurrency", + env = "P_HOT_TIER_FILES_PER_STREAM_CONCURRENCY", + default_value = "4", + help = "Number of concurrent parquet file downloads per stream during hot tier sync" + )] + pub hot_tier_files_per_stream_concurrency: usize, + //TODO: remove this when smart cache is implemented #[arg( long = "index-storage-path", diff --git a/src/hottier.rs b/src/hottier.rs index c6172d966..887f0c4d7 100644 --- a/src/hottier.rs +++ b/src/hottier.rs @@ -17,10 +17,12 @@ */ use std::{ - collections::BTreeMap, + collections::{BTreeMap, HashMap}, io, path::{Path, PathBuf}, + sync::Arc, }; +use tokio::sync::{Mutex as AsyncMutex, RwLock as AsyncRwLock}; use crate::{ catalog::manifest::{File, Manifest}, @@ -64,9 +66,18 @@ pub struct StreamHotTier { pub oldest_date_time_entry: Option, } +/// Per-stream in-memory bookkeeping. 
Mutex protects concurrent reservation, +/// commit, and per-date manifest writes. Downloads run outside the lock. +struct StreamSyncState { + sht: AsyncMutex, +} + +type StreamKey = (Option, String); + pub struct HotTierManager { filesystem: LocalFileSystem, hot_tier_path: &'static Path, + state_cache: AsyncRwLock>>, } impl HotTierManager { @@ -75,9 +86,39 @@ impl HotTierManager { HotTierManager { filesystem: LocalFileSystem::new(), hot_tier_path, + state_cache: AsyncRwLock::new(HashMap::new()), } } + /// Lazy-load and cache the `StreamHotTier` for a (tenant, stream) pair. + /// All sync-path mutations should acquire `state.sht.lock()`. + async fn get_or_load_state( + &self, + stream: &str, + tenant_id: &Option, + ) -> Result, HotTierError> { + let key: StreamKey = (tenant_id.clone(), stream.to_owned()); + if let Some(state) = self.state_cache.read().await.get(&key).cloned() { + return Ok(state); + } + let mut cache = self.state_cache.write().await; + if let Some(state) = cache.get(&key).cloned() { + return Ok(state); + } + let sht = self.get_hot_tier(stream, tenant_id).await?; + let state = Arc::new(StreamSyncState { + sht: AsyncMutex::new(sht), + }); + cache.insert(key, state.clone()); + Ok(state) + } + + /// Drop cached state for a stream (used after delete). 
+ pub async fn invalidate_state(&self, stream: &str, tenant_id: &Option) { + let key: StreamKey = (tenant_id.clone(), stream.to_owned()); + self.state_cache.write().await.remove(&key); + } + /// Get a global pub fn global() -> Option<&'static HotTierManager> { static INSTANCE: OnceCell = OnceCell::new(); @@ -219,6 +260,7 @@ impl HotTierManager { self.hot_tier_path.join(stream) }; fs::remove_dir_all(path).await?; + self.invalidate_state(stream, tenant_id).await; Ok(()) } @@ -319,9 +361,6 @@ impl HotTierManager { stream: String, tenant_id: Option, ) -> Result<(), HotTierError> { - let stream_hot_tier = self.get_hot_tier(&stream, &tenant_id).await?; - let mut parquet_file_size = stream_hot_tier.used_size; - let mut s3_manifest_file_list = PARSEABLE .metastore .get_all_manifest_files(&stream, &tenant_id) @@ -332,13 +371,8 @@ impl HotTierManager { ))) })?; - self.process_manifest( - &stream, - &mut s3_manifest_file_list, - &mut parquet_file_size, - &tenant_id, - ) - .await?; + self.process_manifest(&stream, &mut s3_manifest_file_list, &tenant_id) + .await?; Ok(()) } @@ -351,118 +385,172 @@ impl HotTierManager { &self, stream: &str, manifest_files_to_download: &mut BTreeMap>, - parquet_file_size: &mut u64, tenant_id: &Option, ) -> Result<(), HotTierError> { if manifest_files_to_download.is_empty() { return Ok(()); } + + // Build flat work list: (date, file, local_path) ordered latest-date-first, + // and within a date by descending file_path (matches old `pop()` order). 
+ let mut work: Vec<(NaiveDate, File, PathBuf)> = Vec::new(); for (str_date, manifest_files) in manifest_files_to_download.iter().rev() { - let mut storage_combined_manifest = Manifest::default(); + let date = + match NaiveDate::parse_from_str(str_date.trim_start_matches("date="), "%Y-%m-%d") { + Ok(d) => d, + Err(_) => { + warn!("Invalid date format: {}", str_date); + continue; + } + }; + let mut combined = Manifest::default(); for storage_manifest in manifest_files { - storage_combined_manifest - .files - .extend(storage_manifest.files.clone()); + combined.files.extend(storage_manifest.files.clone()); } + combined.files.sort_by_key(|f| f.file_path.clone()); + while let Some(parquet_file) = combined.files.pop() { + let parquet_path = self.hot_tier_path.join(&parquet_file.file_path); + if !parquet_path.exists() { + work.push((date, parquet_file, parquet_path)); + } + } + } - storage_combined_manifest - .files - .sort_by_key(|file| file.file_path.clone()); - - while let Some(parquet_file) = storage_combined_manifest.files.pop() { - let parquet_file_path = &parquet_file.file_path; - let parquet_path = self.hot_tier_path.join(parquet_file_path); + if work.is_empty() { + return Ok(()); + } - if !parquet_path.exists() { - if let Ok(date) = - NaiveDate::parse_from_str(str_date.trim_start_matches("date="), "%Y-%m-%d") - { - if !self - .process_parquet_file( - stream, - &parquet_file, - parquet_file_size, - parquet_path, - date, - tenant_id, - ) - .await? - { - break; - } - } else { - warn!("Invalid date format: {}", str_date); + let state = self.get_or_load_state(stream, tenant_id).await?; + let concurrency = PARSEABLE + .options + .hot_tier_files_per_stream_concurrency + .max(4); + + // Reservation failure (out of disk + nothing to evict) is sticky: + // once one file can't be placed, no subsequent file will fit either. 
+ let stop = Arc::new(std::sync::atomic::AtomicBool::new(false)); + + let stream_owned = stream.to_owned(); + let tenant_owned = tenant_id.clone(); + + let results: Vec> = futures::stream::iter(work) + .map(|(date, file, parquet_path)| { + let state = state.clone(); + let stream = stream_owned.clone(); + let tenant_id = tenant_owned.clone(); + let stop = stop.clone(); + async move { + if stop.load(std::sync::atomic::Ordering::Relaxed) { + return Ok(()); + } + let processed = self + .process_parquet_file_concurrent( + &stream, + &file, + parquet_path, + date, + &tenant_id, + &state, + ) + .await?; + if !processed { + stop.store(true, std::sync::atomic::Ordering::Relaxed); } + Ok(()) } - } + }) + .buffer_unordered(concurrency) + .collect() + .await; + + for r in results { + r?; } Ok(()) } - /// process the parquet file for the stream - /// check if the disk is available to download the parquet file - /// if not available, delete the oldest entry from the hot tier directory - /// download the parquet file from S3 to the hot tier directory - /// update the used and available size in the hot tier metadata - /// return true if the parquet file is processed successfully - async fn process_parquet_file( + /// Reserve disk budget under the per-stream lock, download outside the lock, + /// then commit usage + per-date manifest under the lock again. + /// Returns false when no budget is available (caller should stop scheduling + /// further work for this stream). + async fn process_parquet_file_concurrent( &self, stream: &str, parquet_file: &File, - parquet_file_size: &mut u64, parquet_path: PathBuf, date: NaiveDate, tenant_id: &Option, + state: &Arc, ) -> Result { - let mut file_processed = false; - let mut stream_hot_tier = self.get_hot_tier(stream, tenant_id).await?; - if !self.is_disk_available(parquet_file.file_size).await? 
- || stream_hot_tier.available_size <= parquet_file.file_size + // RESERVE { - if !self - .cleanup_hot_tier_old_data( - stream, - &mut stream_hot_tier, - &parquet_path, - parquet_file.file_size, - tenant_id, - ) - .await? - { - return Ok(file_processed); + let mut sht = state.sht.lock().await; + if (!self.is_disk_available(parquet_file.file_size).await? + || sht.available_size <= parquet_file.file_size) + && !self + .cleanup_hot_tier_old_data( + stream, + &mut sht, + &parquet_path, + parquet_file.file_size, + tenant_id, + ) + .await? + { + return Ok(false); + } + if sht.available_size <= parquet_file.file_size { + return Ok(false); } - *parquet_file_size = stream_hot_tier.used_size; + sht.available_size -= parquet_file.file_size; + self.put_hot_tier(stream, &mut sht, tenant_id).await?; } + + // DOWNLOAD (no lock held) let parquet_file_path = RelativePathBuf::from(parquet_file.file_path.clone()); fs::create_dir_all(parquet_path.parent().unwrap()).await?; - PARSEABLE + let download_result = PARSEABLE .storage .get_object_store() - .buffered_write(&parquet_file_path, tenant_id, parquet_path) - .await?; - *parquet_file_size += parquet_file.file_size; - stream_hot_tier.used_size = *parquet_file_size; + .buffered_write(&parquet_file_path, tenant_id, parquet_path.clone()) + .await; + + if let Err(e) = download_result { + // refund reservation + let mut sht = state.sht.lock().await; + sht.available_size += parquet_file.file_size; + if let Err(put_err) = self.put_hot_tier(stream, &mut sht, tenant_id).await { + error!("failed to persist refund after download failure: {put_err:?}"); + } + // best-effort cleanup of any partial file + let _ = fs::remove_file(&parquet_path).await; + return Err(e.into()); + } - stream_hot_tier.available_size -= parquet_file.file_size; - self.put_hot_tier(stream, &mut stream_hot_tier, tenant_id) - .await?; - file_processed = true; - let path = self.get_stream_path_for_date(stream, &date, tenant_id); - let mut hot_tier_manifest = 
HotTierManager::get_hot_tier_manifest_from_path(path).await?; - hot_tier_manifest.files.push(parquet_file.clone()); - hot_tier_manifest - .files - .sort_by_key(|file| file.file_path.clone()); - // write the manifest file to the hot tier directory - let manifest_path = self - .get_stream_path_for_date(stream, &date, tenant_id) - .join("hottier.manifest.json"); - fs::create_dir_all(manifest_path.parent().unwrap()).await?; - fs::write(manifest_path, serde_json::to_vec(&hot_tier_manifest)?).await?; - - Ok(file_processed) + // COMMIT + { + let mut sht = state.sht.lock().await; + sht.used_size += parquet_file.file_size; + self.put_hot_tier(stream, &mut sht, tenant_id).await?; + + let path = self.get_stream_path_for_date(stream, &date, tenant_id); + let mut hot_tier_manifest = + HotTierManager::get_hot_tier_manifest_from_path(path).await?; + hot_tier_manifest.files.push(parquet_file.clone()); + hot_tier_manifest + .files + .sort_by_key(|file| file.file_path.clone()); + // write the manifest file to the hot tier directory + let manifest_path = self + .get_stream_path_for_date(stream, &date, tenant_id) + .join("hottier.manifest.json"); + fs::create_dir_all(manifest_path.parent().unwrap()).await?; + fs::write(manifest_path, serde_json::to_vec(&hot_tier_manifest)?).await?; + } + + Ok(true) } ///fetch the list of dates available in the hot tier directory for the stream and sort them From 2b786c7582785d0badd414c3bdbdac8c16ca689e Mon Sep 17 00:00:00 2001 From: Anant Vindal Date: Tue, 5 May 2026 18:34:06 +0530 Subject: [PATCH 06/18] concurrent writes instead of mutex --- src/hottier.rs | 6 +++--- src/storage/azure_blob.rs | 28 ++++++++++++++-------------- src/storage/gcs.rs | 28 ++++++++++++++-------------- src/storage/s3.rs | 28 ++++++++++++++-------------- 4 files changed, 45 insertions(+), 45 deletions(-) diff --git a/src/hottier.rs b/src/hottier.rs index 887f0c4d7..d176a66ca 100644 --- a/src/hottier.rs +++ b/src/hottier.rs @@ -498,9 +498,9 @@ impl HotTierManager { tenant_id, 
) .await? - { - return Ok(false); - } + { + return Ok(false); + } if sht.available_size <= parquet_file.file_size { return Ok(false); } diff --git a/src/storage/azure_blob.rs b/src/storage/azure_blob.rs index 46d610573..5f680e9ee 100644 --- a/src/storage/azure_blob.rs +++ b/src/storage/azure_blob.rs @@ -44,11 +44,7 @@ use object_store::{ path::Path as StorePath, }; use relative_path::{RelativePath, RelativePathBuf}; -use tokio::{ - fs::OpenOptions, - io::{AsyncReadExt, AsyncSeekExt, AsyncWriteExt}, - sync::Mutex as AsyncMutex, -}; +use tokio::{fs::OpenOptions, io::AsyncReadExt}; use tracing::error; use url::Url; @@ -234,7 +230,7 @@ impl BlobStore { } let file = tokio::fs::File::create(&write_path).await?; file.set_len(total).await?; - let file = Arc::new(AsyncMutex::new(file)); + let std_file = Arc::new(file.into_std().await); let chunk = PARSEABLE .options @@ -253,7 +249,7 @@ impl BlobStore { for r in ranges { let client = client.clone(); let src = src.clone(); - let file = file.clone(); + let std_file = std_file.clone(); let semaphore = semaphore.clone(); handles.push(tokio::spawn(async move { let _permit = semaphore @@ -261,9 +257,13 @@ impl BlobStore { .await .map_err(|e| ObjectStorageError::Custom(format!("semaphore closed: {e}")))?; let bytes = client.get_range(&src, r.clone()).await?; - let mut f = file.lock().await; - f.seek(std::io::SeekFrom::Start(r.start)).await?; - f.write_all(&bytes).await?; + let offset = r.start; + tokio::task::spawn_blocking(move || -> std::io::Result<()> { + use std::os::unix::fs::FileExt; + std_file.write_all_at(&bytes, offset) + }) + .await + .map_err(|e| ObjectStorageError::Custom(format!("join: {e}")))??; Ok::<_, ObjectStorageError>(()) })); } @@ -272,10 +272,10 @@ impl BlobStore { .map_err(|e| ObjectStorageError::Custom(format!("join error: {e}")))??; } - let file = Arc::try_unwrap(file) - .map_err(|_| ObjectStorageError::Custom("download file arc still shared".into()))? 
- .into_inner(); - file.sync_all().await?; + let std_file_sync = std_file.clone(); + tokio::task::spawn_blocking(move || std_file_sync.sync_all()) + .await + .map_err(|e| ObjectStorageError::Custom(format!("join: {e}")))??; increment_object_store_calls_by_date("GET", &date, tenant_str); increment_files_scanned_in_object_store_calls_by_date("GET", 1, &date, tenant_str); diff --git a/src/storage/gcs.rs b/src/storage/gcs.rs index 561c687f4..d40c80b4b 100644 --- a/src/storage/gcs.rs +++ b/src/storage/gcs.rs @@ -53,11 +53,7 @@ use object_store::{ path::Path as StorePath, }; use relative_path::{RelativePath, RelativePathBuf}; -use tokio::{ - fs::OpenOptions, - io::{AsyncReadExt, AsyncSeekExt, AsyncWriteExt}, - sync::Mutex as AsyncMutex, -}; +use tokio::{fs::OpenOptions, io::AsyncReadExt}; use tracing::error; @@ -200,7 +196,7 @@ impl Gcs { } let file = tokio::fs::File::create(&write_path).await?; file.set_len(total).await?; - let file = Arc::new(AsyncMutex::new(file)); + let std_file = Arc::new(file.into_std().await); let chunk = PARSEABLE .options @@ -219,7 +215,7 @@ impl Gcs { for r in ranges { let client = client.clone(); let src = src.clone(); - let file = file.clone(); + let std_file = std_file.clone(); let semaphore = semaphore.clone(); handles.push(tokio::spawn(async move { let _permit = semaphore @@ -227,9 +223,13 @@ impl Gcs { .await .map_err(|e| ObjectStorageError::Custom(format!("semaphore closed: {e}")))?; let bytes = client.get_range(&src, r.clone()).await?; - let mut f = file.lock().await; - f.seek(std::io::SeekFrom::Start(r.start)).await?; - f.write_all(&bytes).await?; + let offset = r.start; + tokio::task::spawn_blocking(move || -> std::io::Result<()> { + use std::os::unix::fs::FileExt; + std_file.write_all_at(&bytes, offset) + }) + .await + .map_err(|e| ObjectStorageError::Custom(format!("join: {e}")))??; Ok::<_, ObjectStorageError>(()) })); } @@ -238,10 +238,10 @@ impl Gcs { .map_err(|e| ObjectStorageError::Custom(format!("join error: {e}")))??; } - let 
file = Arc::try_unwrap(file) - .map_err(|_| ObjectStorageError::Custom("download file arc still shared".into()))? - .into_inner(); - file.sync_all().await?; + let std_file_sync = std_file.clone(); + tokio::task::spawn_blocking(move || std_file_sync.sync_all()) + .await + .map_err(|e| ObjectStorageError::Custom(format!("join: {e}")))??; increment_object_store_calls_by_date("GET", &date, tenant_str); increment_files_scanned_in_object_store_calls_by_date("GET", 1, &date, tenant_str); diff --git a/src/storage/s3.rs b/src/storage/s3.rs index f1bf5a3ba..4bc72d078 100644 --- a/src/storage/s3.rs +++ b/src/storage/s3.rs @@ -46,11 +46,7 @@ use object_store::{ path::Path as StorePath, }; use relative_path::{RelativePath, RelativePathBuf}; -use tokio::{ - fs::OpenOptions, - io::{AsyncReadExt, AsyncSeekExt, AsyncWriteExt}, - sync::Mutex as AsyncMutex, -}; +use tokio::{fs::OpenOptions, io::AsyncReadExt}; use tracing::error; use crate::{ @@ -361,7 +357,7 @@ impl S3 { } let file = tokio::fs::File::create(&write_path).await?; file.set_len(total).await?; - let file = Arc::new(AsyncMutex::new(file)); + let std_file = Arc::new(file.into_std().await); let chunk = PARSEABLE .options @@ -380,7 +376,7 @@ impl S3 { for r in ranges { let client = client.clone(); let src = src.clone(); - let file = file.clone(); + let std_file = std_file.clone(); let semaphore = semaphore.clone(); handles.push(tokio::spawn(async move { let _permit = semaphore @@ -388,9 +384,13 @@ impl S3 { .await .map_err(|e| ObjectStorageError::Custom(format!("semaphore closed: {e}")))?; let bytes = client.get_range(&src, r.clone()).await?; - let mut f = file.lock().await; - f.seek(std::io::SeekFrom::Start(r.start)).await?; - f.write_all(&bytes).await?; + let offset = r.start; + tokio::task::spawn_blocking(move || -> std::io::Result<()> { + use std::os::unix::fs::FileExt; + std_file.write_all_at(&bytes, offset) + }) + .await + .map_err(|e| ObjectStorageError::Custom(format!("join: {e}")))??; Ok::<_, ObjectStorageError>(()) 
})); } @@ -399,10 +399,10 @@ impl S3 { .map_err(|e| ObjectStorageError::Custom(format!("join error: {e}")))??; } - let file = Arc::try_unwrap(file) - .map_err(|_| ObjectStorageError::Custom("download file arc still shared".into()))? - .into_inner(); - file.sync_all().await?; + let std_file_sync = std_file.clone(); + tokio::task::spawn_blocking(move || std_file_sync.sync_all()) + .await + .map_err(|e| ObjectStorageError::Custom(format!("join: {e}")))??; increment_object_store_calls_by_date("GET", &date, tenant_str); increment_files_scanned_in_object_store_calls_by_date("GET", 1, &date, tenant_str); From 24b893fff0aa2ac49c118311f773299b94a77a6a Mon Sep 17 00:00:00 2001 From: Anant Vindal Date: Wed, 6 May 2026 17:09:09 +0530 Subject: [PATCH 07/18] crash safety by using .partial file --- src/hottier.rs | 97 +++++++++++++++++++++++++++++++++++++-- src/storage/azure_blob.rs | 29 ++++++++++-- src/storage/gcs.rs | 29 ++++++++++-- src/storage/localfs.rs | 15 ++++-- src/storage/mod.rs | 15 ++++++ src/storage/s3.rs | 29 ++++++++++-- 6 files changed, 199 insertions(+), 15 deletions(-) diff --git a/src/hottier.rs b/src/hottier.rs index d176a66ca..d10c7308c 100644 --- a/src/hottier.rs +++ b/src/hottier.rs @@ -105,7 +105,7 @@ impl HotTierManager { if let Some(state) = cache.get(&key).cloned() { return Ok(state); } - let sht = self.get_hot_tier(stream, tenant_id).await?; + let sht = self.reconcile_stream(stream, tenant_id).await?; let state = Arc::new(StreamSyncState { sht: AsyncMutex::new(sht), }); @@ -119,6 +119,98 @@ impl HotTierManager { self.state_cache.write().await.remove(&key); } + /// Walk the on-disk hot-tier directory for a stream and bring it into + /// agreement with `hottier.manifest.json` files. 
Removes `.partial` + /// orphans, drops manifest entries whose files are missing or wrong size, + /// deletes parquet files that exist but are not in their date manifest, + /// then recomputes `used_size` / `available_size` from the cleaned + /// manifests and persists the updated `StreamHotTier`. + async fn reconcile_stream( + &self, + stream: &str, + tenant_id: &Option, + ) -> Result { + let mut sht = self.get_hot_tier(stream, tenant_id).await?; + let dates = self.fetch_hot_tier_dates(stream, tenant_id).await?; + let mut total_used: u64 = 0; + + for date in dates { + let date_dir = self.get_stream_path_for_date(stream, &date, tenant_id); + if !date_dir.exists() { + continue; + } + + // Pass 1: collect on-disk parquet files (drop .partial orphans). + let mut on_disk: std::collections::HashSet = std::collections::HashSet::new(); + let mut entries = fs::read_dir(&date_dir).await?; + while let Some(entry) = entries.next_entry().await? { + let p = entry.path(); + let Some(name_os) = p.file_name() else { + continue; + }; + let name = name_os.to_string_lossy(); + if name.ends_with(".partial") { + let _ = fs::remove_file(&p).await; + continue; + } + if name.ends_with(".manifest.json") { + continue; + } + if !p.is_file() { + continue; + } + on_disk.insert(name.into_owned()); + } + + // Pass 2: clean manifest of stale entries. + let manifest_path = date_dir.join("hottier.manifest.json"); + let mut manifest: Manifest = if manifest_path.exists() { + let bytes = fs::read(&manifest_path).await?; + serde_json::from_slice(&bytes).unwrap_or_default() + } else { + Manifest::default() + }; + + let mut kept = Vec::with_capacity(manifest.files.len()); + let mut keep_names: std::collections::HashSet = + std::collections::HashSet::new(); + for f in manifest.files.drain(..) 
{ + let local = self.hot_tier_path.join(&f.file_path); + let ok = match fs::metadata(&local).await { + Ok(m) => m.len() == f.file_size, + Err(_) => false, + }; + if ok { + if let Some(name) = local.file_name().and_then(|s| s.to_str()) { + keep_names.insert(name.to_owned()); + } + total_used += f.file_size; + kept.push(f); + } else { + let _ = fs::remove_file(&local).await; + } + } + kept.sort_by_key(|f| f.file_path.clone()); + manifest.files = kept; + + if manifest_path.exists() || !manifest.files.is_empty() { + fs::create_dir_all(&date_dir).await?; + fs::write(&manifest_path, serde_json::to_vec(&manifest)?).await?; + } + + // Pass 3: delete on-disk parquet files not referenced by the cleaned manifest. + for name in on_disk.difference(&keep_names) { + let p = date_dir.join(name); + let _ = fs::remove_file(&p).await; + } + } + + sht.used_size = total_used; + sht.available_size = sht.size.saturating_sub(total_used); + self.put_hot_tier(stream, &mut sht, tenant_id).await?; + Ok(sht) + } + /// Get a global pub fn global() -> Option<&'static HotTierManager> { static INSTANCE: OnceCell = OnceCell::new(); @@ -524,8 +616,7 @@ impl HotTierManager { if let Err(put_err) = self.put_hot_tier(stream, &mut sht, tenant_id).await { error!("failed to persist refund after download failure: {put_err:?}"); } - // best-effort cleanup of any partial file - let _ = fs::remove_file(&parquet_path).await; + // backend already cleaned up its `.partial` file; final path was never created. 
return Err(e.into()); } diff --git a/src/storage/azure_blob.rs b/src/storage/azure_blob.rs index 5f680e9ee..52a4668aa 100644 --- a/src/storage/azure_blob.rs +++ b/src/storage/azure_blob.rs @@ -61,7 +61,8 @@ use crate::{ use super::{ CONNECT_TIMEOUT_SECS, ObjectStorage, ObjectStorageError, ObjectStorageProvider, PARSEABLE_ROOT_DIRECTORY, REQUEST_TIMEOUT_SECS, STREAM_METADATA_FILE_NAME, - metrics_layer::MetricLayer, object_storage::parseable_json_path, to_object_store_path, + metrics_layer::MetricLayer, object_storage::parseable_json_path, partial_path, + to_object_store_path, }; #[derive(Debug, Clone, clap::Args)] @@ -216,6 +217,28 @@ impl BlobStore { path: &RelativePath, tenant_id: &Option, write_path: PathBuf, + ) -> Result<(), ObjectStorageError> { + let partial = partial_path(&write_path)?; + match self + ._parallel_download_inner(path, tenant_id, partial.clone()) + .await + { + Ok(()) => { + tokio::fs::rename(&partial, &write_path).await?; + Ok(()) + } + Err(e) => { + let _ = tokio::fs::remove_file(&partial).await; + Err(e) + } + } + } + + async fn _parallel_download_inner( + &self, + path: &RelativePath, + tenant_id: &Option, + partial_path: PathBuf, ) -> Result<(), ObjectStorageError> { let tenant_str = tenant_id.as_deref().unwrap_or(DEFAULT_TENANT); let date = Utc::now().date_naive().to_string(); @@ -225,10 +248,10 @@ impl BlobStore { increment_object_store_calls_by_date("HEAD", &date, tenant_str); let total = meta.size; - if let Some(parent) = write_path.parent() { + if let Some(parent) = partial_path.parent() { tokio::fs::create_dir_all(parent).await?; } - let file = tokio::fs::File::create(&write_path).await?; + let file = tokio::fs::File::create(&partial_path).await?; file.set_len(total).await?; let std_file = Arc::new(file.into_std().await); diff --git a/src/storage/gcs.rs b/src/storage/gcs.rs index d40c80b4b..75dd141b6 100644 --- a/src/storage/gcs.rs +++ b/src/storage/gcs.rs @@ -60,7 +60,8 @@ use tracing::error; use super::{ CONNECT_TIMEOUT_SECS, 
ObjectStorage, ObjectStorageError, ObjectStorageProvider, PARSEABLE_ROOT_DIRECTORY, REQUEST_TIMEOUT_SECS, STREAM_METADATA_FILE_NAME, - metrics_layer::MetricLayer, object_storage::parseable_json_path, to_object_store_path, + metrics_layer::MetricLayer, object_storage::parseable_json_path, partial_path, + to_object_store_path, }; #[derive(Debug, Clone, clap::Args)] @@ -182,6 +183,28 @@ impl Gcs { path: &RelativePath, tenant_id: &Option, write_path: PathBuf, + ) -> Result<(), ObjectStorageError> { + let partial = partial_path(&write_path)?; + match self + ._parallel_download_inner(path, tenant_id, partial.clone()) + .await + { + Ok(()) => { + tokio::fs::rename(&partial, &write_path).await?; + Ok(()) + } + Err(e) => { + let _ = tokio::fs::remove_file(&partial).await; + Err(e) + } + } + } + + async fn _parallel_download_inner( + &self, + path: &RelativePath, + tenant_id: &Option, + partial_path: PathBuf, ) -> Result<(), ObjectStorageError> { let tenant_str = tenant_id.as_deref().unwrap_or(DEFAULT_TENANT); let date = Utc::now().date_naive().to_string(); @@ -191,10 +214,10 @@ impl Gcs { increment_object_store_calls_by_date("HEAD", &date, tenant_str); let total = meta.size; - if let Some(parent) = write_path.parent() { + if let Some(parent) = partial_path.parent() { tokio::fs::create_dir_all(parent).await?; } - let file = tokio::fs::File::create(&write_path).await?; + let file = tokio::fs::File::create(&partial_path).await?; file.set_len(total).await?; let std_file = Arc::new(file.into_std().await); diff --git a/src/storage/localfs.rs b/src/storage/localfs.rs index c9a58502c..8d6027413 100644 --- a/src/storage/localfs.rs +++ b/src/storage/localfs.rs @@ -113,11 +113,20 @@ impl ObjectStorage for LocalFS { write_path: PathBuf, ) -> Result<(), ObjectStorageError> { let src = self.path_in_root(path); - if let Some(parent) = write_path.parent() { + let partial = super::partial_path(&write_path)?; + if let Some(parent) = partial.parent() { fs::create_dir_all(parent).await?; } - 
fs::copy(&src, &write_path).await?; - Ok(()) + match fs::copy(&src, &partial).await { + Ok(_) => { + fs::rename(&partial, &write_path).await?; + Ok(()) + } + Err(e) => { + let _ = fs::remove_file(&partial).await; + Err(e.into()) + } + } } async fn upload_multipart( &self, diff --git a/src/storage/mod.rs b/src/storage/mod.rs index 2ca7a10de..15dd98975 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -344,3 +344,18 @@ pub enum ObjectStorageError { pub fn to_object_store_path(path: &RelativePath) -> Path { Path::from(path.as_str()) } + +/// Append `.partial` to the file name of a local path. Used by hot-tier +/// downloaders to write to a sibling path and atomically rename on success. +pub fn partial_path( + write_path: &std::path::Path, +) -> Result { + let name = write_path + .file_name() + .ok_or_else(|| ObjectStorageError::Custom("download write_path has no file name".into()))?; + let mut next = std::ffi::OsString::from(name); + next.push(".partial"); + let mut buf = write_path.to_path_buf(); + buf.set_file_name(next); + Ok(buf) +} diff --git a/src/storage/s3.rs b/src/storage/s3.rs index 4bc72d078..1ba95ef76 100644 --- a/src/storage/s3.rs +++ b/src/storage/s3.rs @@ -62,7 +62,8 @@ use crate::{ use super::{ CONNECT_TIMEOUT_SECS, ObjectStorage, ObjectStorageError, ObjectStorageProvider, PARSEABLE_ROOT_DIRECTORY, REQUEST_TIMEOUT_SECS, STREAM_METADATA_FILE_NAME, - metrics_layer::MetricLayer, object_storage::parseable_json_path, to_object_store_path, + metrics_layer::MetricLayer, object_storage::parseable_json_path, partial_path, + to_object_store_path, }; // in bytes @@ -343,6 +344,28 @@ impl S3 { path: &RelativePath, tenant_id: &Option, write_path: PathBuf, + ) -> Result<(), ObjectStorageError> { + let partial = partial_path(&write_path)?; + match self + ._parallel_download_inner(path, tenant_id, partial.clone()) + .await + { + Ok(()) => { + tokio::fs::rename(&partial, &write_path).await?; + Ok(()) + } + Err(e) => { + let _ = 
tokio::fs::remove_file(&partial).await; + Err(e) + } + } + } + + async fn _parallel_download_inner( + &self, + path: &RelativePath, + tenant_id: &Option, + partial_path: PathBuf, ) -> Result<(), ObjectStorageError> { let tenant_str = tenant_id.as_deref().unwrap_or(DEFAULT_TENANT); let date = Utc::now().date_naive().to_string(); @@ -352,10 +375,10 @@ impl S3 { increment_object_store_calls_by_date("HEAD", &date, tenant_str); let total = meta.size; - if let Some(parent) = write_path.parent() { + if let Some(parent) = partial_path.parent() { tokio::fs::create_dir_all(parent).await?; } - let file = tokio::fs::File::create(&write_path).await?; + let file = tokio::fs::File::create(&partial_path).await?; file.set_len(total).await?; let std_file = Arc::new(file.into_std().await); From e734cf455461eca2f13969f6e26a43d33833bf31 Mon Sep 17 00:00:00 2001 From: Anant Vindal Date: Wed, 6 May 2026 18:11:50 +0530 Subject: [PATCH 08/18] separate out historic and latest hottier tasks --- src/cli.rs | 16 +++++++++ src/hottier.rs | 92 +++++++++++++++++++++++++++++++++++++++----------- 2 files changed, 89 insertions(+), 19 deletions(-) diff --git a/src/cli.rs b/src/cli.rs index 1eaf617c7..6b5e9d7e4 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -338,6 +338,22 @@ pub struct Options { )] pub hot_tier_files_per_stream_concurrency: usize, + #[arg( + long = "hot-tier-latest-minutes", + env = "P_HOT_TIER_LATEST_MINUTES", + default_value = "10", + help = "Files whose timestamp is within the last N minutes are 'latest'; rest are 'historic'." + )] + pub hot_tier_latest_minutes: i64, + + #[arg( + long = "hot-tier-historic-sync-minutes", + env = "P_HOT_TIER_HISTORIC_SYNC_MINUTES", + default_value = "5", + help = "Interval (minutes) at which the historic hot-tier sync runs." 
+ )] + pub hot_tier_historic_sync_minutes: u32, + //TODO: remove this when smart cache is implemented #[arg( long = "index-storage-path", diff --git a/src/hottier.rs b/src/hottier.rs index d10c7308c..72eacc750 100644 --- a/src/hottier.rs +++ b/src/hottier.rs @@ -72,6 +72,15 @@ struct StreamSyncState { sht: AsyncMutex, } +/// Hot-tier sync runs in two phases. Latest pulls files newer than +/// `hot_tier_latest_minutes` ago and may evict historic to make room. +/// Historic pulls older files, runs less often, never triggers eviction. +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +enum SyncPhase { + Latest, + Historic, +} + type StreamKey = (Option, String); pub struct HotTierManager { @@ -392,7 +401,10 @@ impl HotTierManager { Ok(path) } - /// schedule the download of the hot tier files from S3 every minute + /// schedule two hot-tier sync jobs: a frequent "latest" pass for files + /// within the last `P_HOT_TIER_LATEST_MINUTES`, and a less-frequent + /// "historic" pass for older files. Both share the per-stream lock; only + /// the latest pass triggers eviction. 
pub fn download_from_s3<'a>(&'a self) -> Result<(), HotTierError> where 'a: 'static, @@ -402,8 +414,19 @@ impl HotTierManager { .every(HOT_TIER_SYNC_DURATION) .plus(Interval::Seconds(5)) .run(move || async { - if let Err(err) = self.sync_hot_tier().await { - error!("Error in hot tier scheduler: {:?}", err); + if let Err(err) = self.sync_hot_tier(SyncPhase::Latest).await { + error!("Error in hot tier latest scheduler: {:?}", err); + } + }); + + let historic_interval = + Interval::Minutes(PARSEABLE.options.hot_tier_historic_sync_minutes.max(1)); + scheduler + .every(historic_interval) + .plus(Interval::Seconds(15)) + .run(move || async { + if let Err(err) = self.sync_hot_tier(SyncPhase::Historic).await { + error!("Error in hot tier historic scheduler: {:?}", err); } }); @@ -417,7 +440,7 @@ impl HotTierManager { } /// sync the hot tier files from S3 to the hot tier directory for all streams - async fn sync_hot_tier(&self) -> Result<(), HotTierError> { + async fn sync_hot_tier(&self, phase: SyncPhase) -> Result<(), HotTierError> { // Before syncing, check if pstats stream was created and needs hot tier if let Err(e) = self.create_pstats_hot_tier().await { tracing::trace!("Skipping pstats hot tier creation because of error: {e}"); @@ -432,7 +455,11 @@ impl HotTierManager { for tenant_id in tenants { for stream in PARSEABLE.streams.list(&tenant_id) { if self.check_stream_hot_tier_exists(&stream, &tenant_id) { - sync_hot_tier_tasks.push(self.process_stream(stream, tenant_id.to_owned())); + sync_hot_tier_tasks.push(self.process_stream( + stream, + tenant_id.to_owned(), + phase, + )); } } } @@ -452,6 +479,7 @@ impl HotTierManager { &self, stream: String, tenant_id: Option, + phase: SyncPhase, ) -> Result<(), HotTierError> { let mut s3_manifest_file_list = PARSEABLE .metastore @@ -463,7 +491,7 @@ impl HotTierManager { ))) })?; - self.process_manifest(&stream, &mut s3_manifest_file_list, &tenant_id) + self.process_manifest(&stream, &mut s3_manifest_file_list, &tenant_id, 
phase) .await?; Ok(()) @@ -478,11 +506,15 @@ impl HotTierManager { stream: &str, manifest_files_to_download: &mut BTreeMap>, tenant_id: &Option, + phase: SyncPhase, ) -> Result<(), HotTierError> { if manifest_files_to_download.is_empty() { return Ok(()); } + let latest_minutes = PARSEABLE.options.hot_tier_latest_minutes.max(0); + let cutoff = chrono::Utc::now().naive_utc() - chrono::Duration::minutes(latest_minutes); + // Build flat work list: (date, file, local_path) ordered latest-date-first, // and within a date by descending file_path (matches old `pop()` order). let mut work: Vec<(NaiveDate, File, PathBuf)> = Vec::new(); @@ -503,7 +535,16 @@ impl HotTierManager { combined.files.sort_by_key(|f| f.file_path.clone()); while let Some(parquet_file) = combined.files.pop() { let parquet_path = self.hot_tier_path.join(&parquet_file.file_path); - if !parquet_path.exists() { + if parquet_path.exists() { + continue; + } + let dt = extract_datetime(&parquet_file.file_path); + let is_latest = dt.map(|d| d >= cutoff).unwrap_or(false); + let keep = match phase { + SyncPhase::Latest => is_latest, + SyncPhase::Historic => !is_latest, + }; + if keep { work.push((date, parquet_file, parquet_path)); } } @@ -544,6 +585,7 @@ impl HotTierManager { date, &tenant_id, &state, + phase, ) .await?; if !processed { @@ -567,6 +609,7 @@ impl HotTierManager { /// then commit usage + per-date manifest under the lock again. /// Returns false when no budget is available (caller should stop scheduling /// further work for this stream). + #[allow(clippy::too_many_arguments)] async fn process_parquet_file_concurrent( &self, stream: &str, @@ -575,23 +618,34 @@ impl HotTierManager { date: NaiveDate, tenant_id: &Option, state: &Arc, + phase: SyncPhase, ) -> Result { // RESERVE { let mut sht = state.sht.lock().await; - if (!self.is_disk_available(parquet_file.file_size).await? 
- || sht.available_size <= parquet_file.file_size) - && !self - .cleanup_hot_tier_old_data( - stream, - &mut sht, - &parquet_path, - parquet_file.file_size, - tenant_id, - ) - .await? + if !self.is_disk_available(parquet_file.file_size).await? + || sht.available_size <= parquet_file.file_size { - return Ok(false); + match phase { + SyncPhase::Latest => { + if !self + .cleanup_hot_tier_old_data( + stream, + &mut sht, + &parquet_path, + parquet_file.file_size, + tenant_id, + ) + .await? + { + return Ok(false); + } + } + SyncPhase::Historic => { + // Historic never evicts; just skip when full. + return Ok(false); + } + } } if sht.available_size <= parquet_file.file_size { return Ok(false); From 60b889470a27f4fa0c50b6df460f42ea136f3537 Mon Sep 17 00:00:00 2001 From: Anant Vindal Date: Wed, 6 May 2026 19:53:32 +0530 Subject: [PATCH 09/18] add logs --- src/hottier.rs | 212 +++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 205 insertions(+), 7 deletions(-) diff --git a/src/hottier.rs b/src/hottier.rs index 72eacc750..ef69799d2 100644 --- a/src/hottier.rs +++ b/src/hottier.rs @@ -139,9 +139,13 @@ impl HotTierManager { stream: &str, tenant_id: &Option, ) -> Result { + warn!(stream = %stream, tenant = ?tenant_id, "reconcile starting"); let mut sht = self.get_hot_tier(stream, tenant_id).await?; let dates = self.fetch_hot_tier_dates(stream, tenant_id).await?; let mut total_used: u64 = 0; + let mut partials_removed = 0usize; + let mut entries_dropped = 0usize; + let mut orphans_removed = 0usize; for date in dates { let date_dir = self.get_stream_path_for_date(stream, &date, tenant_id); @@ -160,6 +164,13 @@ impl HotTierManager { let name = name_os.to_string_lossy(); if name.ends_with(".partial") { let _ = fs::remove_file(&p).await; + partials_removed += 1; + warn!( + stream = %stream, + tenant = ?tenant_id, + path = %p.display(), + "reconcile: deleted partial orphan" + ); continue; } if name.ends_with(".manifest.json") { @@ -197,6 +208,13 @@ impl HotTierManager { 
kept.push(f); } else { let _ = fs::remove_file(&local).await; + entries_dropped += 1; + warn!( + stream = %stream, + tenant = ?tenant_id, + file = %f.file_path, + "reconcile: dropped manifest entry (file missing or wrong size)" + ); } } kept.sort_by_key(|f| f.file_path.clone()); @@ -211,12 +229,29 @@ impl HotTierManager { for name in on_disk.difference(&keep_names) { let p = date_dir.join(name); let _ = fs::remove_file(&p).await; + orphans_removed += 1; + warn!( + stream = %stream, + tenant = ?tenant_id, + file = %p.display(), + "reconcile: deleted orphan parquet not in manifest" + ); } } sht.used_size = total_used; sht.available_size = sht.size.saturating_sub(total_used); self.put_hot_tier(stream, &mut sht, tenant_id).await?; + warn!( + stream = %stream, + tenant = ?tenant_id, + partials_removed, + entries_dropped, + orphans_removed, + used = sht.used_size, + available = sht.available_size, + "reconcile done" + ); Ok(sht) } @@ -409,22 +444,30 @@ impl HotTierManager { where 'a: 'static, { + let latest_min = PARSEABLE.options.hot_tier_latest_minutes; + let historic_min = PARSEABLE.options.hot_tier_historic_sync_minutes.max(1); + warn!( + latest_minutes = latest_min, + historic_sync_minutes = historic_min, + "hot tier scheduler starting" + ); + let mut scheduler = AsyncScheduler::new(); scheduler .every(HOT_TIER_SYNC_DURATION) .plus(Interval::Seconds(5)) .run(move || async { + warn!(phase = ?SyncPhase::Latest, "hot tier tick fired"); if let Err(err) = self.sync_hot_tier(SyncPhase::Latest).await { error!("Error in hot tier latest scheduler: {:?}", err); } }); - let historic_interval = - Interval::Minutes(PARSEABLE.options.hot_tier_historic_sync_minutes.max(1)); scheduler - .every(historic_interval) + .every(Interval::Minutes(historic_min)) .plus(Interval::Seconds(15)) .run(move || async { + warn!(phase = ?SyncPhase::Historic, "hot tier tick fired"); if let Err(err) = self.sync_hot_tier(SyncPhase::Historic).await { error!("Error in hot tier historic scheduler: {:?}", 
err); } @@ -452,6 +495,7 @@ impl HotTierManager { } else { vec![None] }; + let mut scheduled = 0usize; for tenant_id in tenants { for stream in PARSEABLE.streams.list(&tenant_id) { if self.check_stream_hot_tier_exists(&stream, &tenant_id) { @@ -460,9 +504,11 @@ impl HotTierManager { tenant_id.to_owned(), phase, )); + scheduled += 1; } } } + warn!(phase = ?phase, num_streams = scheduled, "hot tier tick: streams scheduled"); while let Some(res) = sync_hot_tier_tasks.next().await { if let Err(err) = res { @@ -470,6 +516,7 @@ impl HotTierManager { return Err(err); } } + warn!(phase = ?phase, "hot tier tick complete"); Ok(()) } @@ -481,6 +528,12 @@ impl HotTierManager { tenant_id: Option, phase: SyncPhase, ) -> Result<(), HotTierError> { + warn!( + stream = %stream, + tenant = ?tenant_id, + phase = ?phase, + "fetching manifest from metastore" + ); let mut s3_manifest_file_list = PARSEABLE .metastore .get_all_manifest_files(&stream, &tenant_id) @@ -490,10 +543,24 @@ impl HotTierManager { e.to_detail(), ))) })?; + let date_count = s3_manifest_file_list.len(); + let total_files: usize = s3_manifest_file_list + .values() + .map(|m| m.iter().map(|x| x.files.len()).sum::()) + .sum(); + warn!( + stream = %stream, + tenant = ?tenant_id, + phase = ?phase, + dates = date_count, + files = total_files, + "manifest fetched" + ); self.process_manifest(&stream, &mut s3_manifest_file_list, &tenant_id, phase) .await?; + warn!(stream = %stream, tenant = ?tenant_id, phase = ?phase, "stream sync done"); Ok(()) } @@ -550,9 +617,25 @@ impl HotTierManager { } } + let work_count = work.len(); + let total_bytes: u64 = work.iter().map(|(_, f, _)| f.file_size).sum(); if work.is_empty() { + warn!( + stream = %stream, + tenant = ?tenant_id, + phase = ?phase, + "no files to download this tick" + ); return Ok(()); } + warn!( + stream = %stream, + tenant = ?tenant_id, + phase = ?phase, + work_count, + total_bytes, + "work list built" + ); let state = self.get_or_load_state(stream, tenant_id).await?; 
let concurrency = PARSEABLE @@ -588,9 +671,15 @@ impl HotTierManager { phase, ) .await?; - if !processed { - stop.store(true, std::sync::atomic::Ordering::Relaxed); - } + if !processed + && !stop.swap(true, std::sync::atomic::Ordering::Relaxed) { + warn!( + stream = %stream, + tenant = ?tenant_id, + phase = ?phase, + "sticky stop: halting further reservations this tick" + ); + } Ok(()) } }) @@ -623,11 +712,29 @@ impl HotTierManager { // RESERVE { let mut sht = state.sht.lock().await; + warn!( + stream = %stream, + tenant = ?tenant_id, + phase = ?phase, + file = %parquet_file.file_path, + file_size = parquet_file.file_size, + available = sht.available_size, + used = sht.used_size, + "reserving" + ); if !self.is_disk_available(parquet_file.file_size).await? || sht.available_size <= parquet_file.file_size { match phase { SyncPhase::Latest => { + warn!( + stream = %stream, + tenant = ?tenant_id, + file = %parquet_file.file_path, + file_size = parquet_file.file_size, + available = sht.available_size, + "tight on space; triggering eviction" + ); if !self .cleanup_hot_tier_old_data( stream, @@ -638,25 +745,62 @@ impl HotTierManager { ) .await? { + warn!( + stream = %stream, + tenant = ?tenant_id, + file = %parquet_file.file_path, + file_size = parquet_file.file_size, + "eviction freed nothing, skipping file" + ); return Ok(false); } } SyncPhase::Historic => { - // Historic never evicts; just skip when full. 
+ warn!( + stream = %stream, + tenant = ?tenant_id, + file = %parquet_file.file_path, + file_size = parquet_file.file_size, + available = sht.available_size, + "historic phase: full, skipping file" + ); return Ok(false); } } } if sht.available_size <= parquet_file.file_size { + warn!( + stream = %stream, + tenant = ?tenant_id, + file = %parquet_file.file_path, + file_size = parquet_file.file_size, + available = sht.available_size, + "still no space after eviction, skipping" + ); return Ok(false); } sht.available_size -= parquet_file.file_size; self.put_hot_tier(stream, &mut sht, tenant_id).await?; + warn!( + stream = %stream, + tenant = ?tenant_id, + file = %parquet_file.file_path, + deducted = parquet_file.file_size, + new_available = sht.available_size, + "reserved" + ); } // DOWNLOAD (no lock held) let parquet_file_path = RelativePathBuf::from(parquet_file.file_path.clone()); fs::create_dir_all(parquet_path.parent().unwrap()).await?; + warn!( + stream = %stream, + tenant = ?tenant_id, + file = %parquet_file.file_path, + file_size = parquet_file.file_size, + "download starting" + ); let download_result = PARSEABLE .storage .get_object_store() @@ -664,6 +808,13 @@ impl HotTierManager { .await; if let Err(e) = download_result { + warn!( + stream = %stream, + tenant = ?tenant_id, + file = %parquet_file.file_path, + err = %e, + "download failed, refunding reservation" + ); // refund reservation let mut sht = state.sht.lock().await; sht.available_size += parquet_file.file_size; @@ -673,6 +824,13 @@ impl HotTierManager { // backend already cleaned up its `.partial` file; final path was never created. 
return Err(e.into()); } + warn!( + stream = %stream, + tenant = ?tenant_id, + file = %parquet_file.file_path, + file_size = parquet_file.file_size, + "download finished, committing" + ); // COMMIT { @@ -693,6 +851,14 @@ impl HotTierManager { .join("hottier.manifest.json"); fs::create_dir_all(manifest_path.parent().unwrap()).await?; fs::write(manifest_path, serde_json::to_vec(&hot_tier_manifest)?).await?; + warn!( + stream = %stream, + tenant = ?tenant_id, + file = %parquet_file.file_path, + used = sht.used_size, + available = sht.available_size, + "committed" + ); } Ok(true) @@ -872,7 +1038,15 @@ impl HotTierManager { parquet_file_size: u64, tenant_id: &Option, ) -> Result { + warn!( + stream = %stream, + tenant = ?tenant_id, + target_size = parquet_file_size, + available = stream_hot_tier.available_size, + "eviction starting" + ); let mut delete_successful = false; + let mut freed_total: u64 = 0; let dates = self.fetch_hot_tier_dates(stream, tenant_id).await?; 'loop_dates: for date in dates { let path = self.get_stream_path_for_date(stream, &date, tenant_id); @@ -905,6 +1079,13 @@ impl HotTierManager { extract_datetime(path_to_delete.to_str().unwrap()), ) && download_date_time <= delete_date_time { + warn!( + stream = %stream, + tenant = ?tenant_id, + candidate = %file_to_delete.file_path, + target = %download_file_path.display(), + "skip evict: candidate newer than target" + ); delete_successful = false; break 'loop_files; } @@ -922,6 +1103,16 @@ impl HotTierManager { self.put_hot_tier(stream, stream_hot_tier, tenant_id) .await?; delete_successful = true; + freed_total += file_size; + warn!( + stream = %stream, + tenant = ?tenant_id, + evicted_file = %file_to_delete.file_path, + evicted_size = file_size, + freed_total, + new_available = stream_hot_tier.available_size, + "evicted" + ); if stream_hot_tier.available_size <= parquet_file_size { continue 'loop_files; @@ -935,6 +1126,13 @@ impl HotTierManager { } } + warn!( + stream = %stream, + tenant = ?tenant_id, + 
freed_total, + success = delete_successful, + "eviction complete" + ); Ok(delete_successful) } From b5a348eedd0815856c40f0e776d4e09595314ea6 Mon Sep 17 00:00:00 2001 From: Anant Vindal Date: Wed, 6 May 2026 23:43:17 +0530 Subject: [PATCH 10/18] two loops instead of clockwerk --- src/hottier.rs | 88 ++++++++++++++++++++++++++++++-------------------- 1 file changed, 53 insertions(+), 35 deletions(-) diff --git a/src/hottier.rs b/src/hottier.rs index ef69799d2..1a8385dc7 100644 --- a/src/hottier.rs +++ b/src/hottier.rs @@ -34,7 +34,6 @@ use crate::{ validator::error::HotTierValidationError, }; use chrono::NaiveDate; -use clokwerk::{AsyncScheduler, Interval, Job}; use futures::{StreamExt, TryStreamExt, stream::FuturesUnordered}; use futures_util::TryFutureExt; use object_store::{ObjectStoreExt, local::LocalFileSystem}; @@ -49,7 +48,6 @@ use tracing::{error, warn}; pub const STREAM_HOT_TIER_FILENAME: &str = ".hot_tier.json"; pub const MIN_STREAM_HOT_TIER_SIZE_BYTES: u64 = 10737418240; // 10 GiB -const HOT_TIER_SYNC_DURATION: Interval = clokwerk::Interval::Minutes(1); pub const INTERNAL_STREAM_HOT_TIER_SIZE_BYTES: u64 = 10485760; //10 MiB pub const CURRENT_HOT_TIER_VERSION: &str = "v2"; @@ -436,10 +434,11 @@ impl HotTierManager { Ok(path) } - /// schedule two hot-tier sync jobs: a frequent "latest" pass for files - /// within the last `P_HOT_TIER_LATEST_MINUTES`, and a less-frequent - /// "historic" pass for older files. Both share the per-stream lock; only - /// the latest pass triggers eviction. + /// Spawn two independent loops, one per phase, so Latest and Historic + /// tick clocks are independent. A long-running Historic backfill no + /// longer delays Latest ticks. Within each phase, the next iteration + /// starts only after the previous returns + a sleep, so a phase can + /// never overlap itself. 
pub fn download_from_s3<'a>(&'a self) -> Result<(), HotTierError> where 'a: 'static, @@ -452,31 +451,28 @@ impl HotTierManager { "hot tier scheduler starting" ); - let mut scheduler = AsyncScheduler::new(); - scheduler - .every(HOT_TIER_SYNC_DURATION) - .plus(Interval::Seconds(5)) - .run(move || async { + let latest_interval = Duration::from_secs(60); + let historic_interval = Duration::from_secs(historic_min as u64 * 60); + + let this = self; + tokio::spawn(async move { + loop { warn!(phase = ?SyncPhase::Latest, "hot tier tick fired"); - if let Err(err) = self.sync_hot_tier(SyncPhase::Latest).await { + if let Err(err) = this.sync_hot_tier(SyncPhase::Latest).await { error!("Error in hot tier latest scheduler: {:?}", err); } - }); + tokio::time::sleep(latest_interval).await; + } + }); - scheduler - .every(Interval::Minutes(historic_min)) - .plus(Interval::Seconds(15)) - .run(move || async { + let this = self; + tokio::spawn(async move { + loop { warn!(phase = ?SyncPhase::Historic, "hot tier tick fired"); - if let Err(err) = self.sync_hot_tier(SyncPhase::Historic).await { + if let Err(err) = this.sync_hot_tier(SyncPhase::Historic).await { error!("Error in hot tier historic scheduler: {:?}", err); } - }); - - tokio::spawn(async move { - loop { - scheduler.run_pending().await; - tokio::time::sleep(Duration::from_secs(10)).await; + tokio::time::sleep(historic_interval).await; } }); Ok(()) @@ -510,13 +506,18 @@ impl HotTierManager { } warn!(phase = ?phase, num_streams = scheduled, "hot tier tick: streams scheduled"); + let tick_start = std::time::Instant::now(); while let Some(res) = sync_hot_tier_tasks.next().await { if let Err(err) = res { error!("Failed to run hot tier sync task {err:?}"); return Err(err); } } - warn!(phase = ?phase, "hot tier tick complete"); + warn!( + phase = ?phase, + elapsed_ms = tick_start.elapsed().as_millis() as u64, + "hot tier tick complete" + ); Ok(()) } @@ -557,10 +558,17 @@ impl HotTierManager { "manifest fetched" ); + let stream_start = 
std::time::Instant::now(); self.process_manifest(&stream, &mut s3_manifest_file_list, &tenant_id, phase) .await?; - warn!(stream = %stream, tenant = ?tenant_id, phase = ?phase, "stream sync done"); + warn!( + stream = %stream, + tenant = ?tenant_id, + phase = ?phase, + elapsed_ms = stream_start.elapsed().as_millis() as u64, + "stream sync done" + ); Ok(()) } @@ -671,15 +679,14 @@ impl HotTierManager { phase, ) .await?; - if !processed - && !stop.swap(true, std::sync::atomic::Ordering::Relaxed) { - warn!( - stream = %stream, - tenant = ?tenant_id, - phase = ?phase, - "sticky stop: halting further reservations this tick" - ); - } + if !processed && !stop.swap(true, std::sync::atomic::Ordering::Relaxed) { + warn!( + stream = %stream, + tenant = ?tenant_id, + phase = ?phase, + "sticky stop: halting further reservations this tick" + ); + } Ok(()) } }) @@ -801,17 +808,20 @@ impl HotTierManager { file_size = parquet_file.file_size, "download starting" ); + let dl_start = std::time::Instant::now(); let download_result = PARSEABLE .storage .get_object_store() .buffered_write(&parquet_file_path, tenant_id, parquet_path.clone()) .await; + let dl_elapsed = dl_start.elapsed(); if let Err(e) = download_result { warn!( stream = %stream, tenant = ?tenant_id, file = %parquet_file.file_path, + elapsed_ms = dl_elapsed.as_millis() as u64, err = %e, "download failed, refunding reservation" ); @@ -824,11 +834,19 @@ impl HotTierManager { // backend already cleaned up its `.partial` file; final path was never created. 
return Err(e.into()); } + let elapsed_ms = dl_elapsed.as_millis() as u64; + let mbps = if dl_elapsed.as_secs_f64() > 0.0 { + (parquet_file.file_size as f64 * 8.0) / dl_elapsed.as_secs_f64() / 1_000_000.0 + } else { + 0.0 + }; warn!( stream = %stream, tenant = ?tenant_id, file = %parquet_file.file_path, file_size = parquet_file.file_size, + elapsed_ms, + mbps = format!("{mbps:.1}"), "download finished, committing" ); From 2b2fe6196b5b713146b6668ccf83792d59dc4855 Mon Sep 17 00:00:00 2001 From: Anant Vindal Date: Thu, 7 May 2026 15:34:04 +0530 Subject: [PATCH 11/18] per-stream hottier tasks --- src/handlers/http/logstream.rs | 3 + src/hottier.rs | 142 ++++++++++++++++++++------------- 2 files changed, 89 insertions(+), 56 deletions(-) diff --git a/src/handlers/http/logstream.rs b/src/handlers/http/logstream.rs index 86f83329c..1abe4e075 100644 --- a/src/handlers/http/logstream.rs +++ b/src/handlers/http/logstream.rs @@ -455,6 +455,9 @@ pub async fn put_stream_hot_tier( hot_tier_manager .put_hot_tier(&stream_name, &mut hottier, &tenant_id) .await?; + hot_tier_manager + .spawn_stream_tasks(stream_name.clone(), tenant_id.clone()) + .await; let mut stream_metadata: ObjectStoreFormat = serde_json::from_slice( &PARSEABLE diff --git a/src/hottier.rs b/src/hottier.rs index 1a8385dc7..9c7e0f6ed 100644 --- a/src/hottier.rs +++ b/src/hottier.rs @@ -81,10 +81,16 @@ enum SyncPhase { type StreamKey = (Option, String); +struct StreamTasks { + latest: tokio::task::JoinHandle<()>, + historic: tokio::task::JoinHandle<()>, +} + pub struct HotTierManager { filesystem: LocalFileSystem, hot_tier_path: &'static Path, state_cache: AsyncRwLock>>, + tasks: AsyncRwLock>, } impl HotTierManager { @@ -94,6 +100,7 @@ impl HotTierManager { filesystem: LocalFileSystem::new(), hot_tier_path, state_cache: AsyncRwLock::new(HashMap::new()), + tasks: AsyncRwLock::new(HashMap::new()), } } @@ -388,6 +395,9 @@ impl HotTierManager { if !self.check_stream_hot_tier_exists(stream, tenant_id) { return 
Err(HotTierValidationError::NotFound(stream.to_owned()).into()); } + // Stop loops before tearing down the directory so no in-flight tick + // re-creates files mid-delete. + self.abort_stream_tasks(stream, tenant_id).await; let path = if let Some(tenant_id) = tenant_id.as_ref() { self.hot_tier_path.join(tenant_id).join(stream) } else { @@ -434,11 +444,9 @@ impl HotTierManager { Ok(path) } - /// Spawn two independent loops, one per phase, so Latest and Historic - /// tick clocks are independent. A long-running Historic backfill no - /// longer delays Latest ticks. Within each phase, the next iteration - /// starts only after the previous returns + a sleep, so a phase can - /// never overlap itself. + /// Discover hot-tier-enabled streams at boot and spawn a per-stream pair + /// of (Latest, Historic) loops for each. New streams added later acquire + /// their own loops via `spawn_stream_tasks` from the PUT hot-tier handler. pub fn download_from_s3<'a>(&'a self) -> Result<(), HotTierError> where 'a: 'static, @@ -451,74 +459,96 @@ impl HotTierManager { "hot tier scheduler starting" ); + let this: &'static HotTierManager = self; + tokio::spawn(async move { + // pstats hot tier may need to be created on boot before any tasks + // can pick it up. + if let Err(e) = this.create_pstats_hot_tier().await { + tracing::error!("Skipping pstats hot tier creation because of error: {e}"); + } + let tenants = if let Some(tenants) = PARSEABLE.list_tenants() { + tenants.into_iter().map(Some).collect::>() + } else { + vec![None] + }; + for tenant_id in tenants { + for stream in PARSEABLE.streams.list(&tenant_id) { + if this.check_stream_hot_tier_exists(&stream, &tenant_id) { + this.spawn_stream_tasks(stream, tenant_id.clone()).await; + } + } + } + }); + Ok(()) + } + + /// Spawn (Latest, Historic) loops for a single stream. Idempotent: + /// if tasks already exist for this (tenant, stream), no-op. 
+ pub async fn spawn_stream_tasks(&'static self, stream: String, tenant_id: Option) { + let key: StreamKey = (tenant_id.clone(), stream.clone()); + { + let tasks = self.tasks.read().await; + if let Some(existing) = tasks.get(&key) + && !existing.latest.is_finished() + && !existing.historic.is_finished() + { + return; + } + } + let latest_interval = Duration::from_secs(60); - let historic_interval = Duration::from_secs(historic_min as u64 * 60); + let historic_interval = Duration::from_secs( + PARSEABLE.options.hot_tier_historic_sync_minutes.max(1) as u64 * 60, + ); - let this = self; - tokio::spawn(async move { + warn!(stream = %stream, tenant = ?tenant_id, "spawning per-stream hot tier tasks"); + + let s = stream.clone(); + let t = tenant_id.clone(); + let latest = tokio::spawn(async move { loop { - warn!(phase = ?SyncPhase::Latest, "hot tier tick fired"); - if let Err(err) = this.sync_hot_tier(SyncPhase::Latest).await { - error!("Error in hot tier latest scheduler: {:?}", err); + warn!(stream = %s, tenant = ?t, phase = ?SyncPhase::Latest, "stream tick fired"); + if let Err(err) = self + .process_stream(s.clone(), t.clone(), SyncPhase::Latest) + .await + { + error!(stream = %s, "latest sync error: {err:?}"); } tokio::time::sleep(latest_interval).await; } }); - let this = self; - tokio::spawn(async move { + let s = stream.clone(); + let t = tenant_id.clone(); + let historic = tokio::spawn(async move { loop { - warn!(phase = ?SyncPhase::Historic, "hot tier tick fired"); - if let Err(err) = this.sync_hot_tier(SyncPhase::Historic).await { - error!("Error in hot tier historic scheduler: {:?}", err); + warn!(stream = %s, tenant = ?t, phase = ?SyncPhase::Historic, "stream tick fired"); + if let Err(err) = self + .process_stream(s.clone(), t.clone(), SyncPhase::Historic) + .await + { + error!(stream = %s, "historic sync error: {err:?}"); } tokio::time::sleep(historic_interval).await; } }); - Ok(()) - } - /// sync the hot tier files from S3 to the hot tier directory for 
all streams - async fn sync_hot_tier(&self, phase: SyncPhase) -> Result<(), HotTierError> { - // Before syncing, check if pstats stream was created and needs hot tier - if let Err(e) = self.create_pstats_hot_tier().await { - tracing::trace!("Skipping pstats hot tier creation because of error: {e}"); + let mut tasks = self.tasks.write().await; + if let Some(old) = tasks.insert(key, StreamTasks { latest, historic }) { + old.latest.abort(); + old.historic.abort(); } + } - let mut sync_hot_tier_tasks = FuturesUnordered::new(); - let tenants = if let Some(tenants) = PARSEABLE.list_tenants() { - tenants.into_iter().map(Some).collect() - } else { - vec![None] - }; - let mut scheduled = 0usize; - for tenant_id in tenants { - for stream in PARSEABLE.streams.list(&tenant_id) { - if self.check_stream_hot_tier_exists(&stream, &tenant_id) { - sync_hot_tier_tasks.push(self.process_stream( - stream, - tenant_id.to_owned(), - phase, - )); - scheduled += 1; - } - } - } - warn!(phase = ?phase, num_streams = scheduled, "hot tier tick: streams scheduled"); - - let tick_start = std::time::Instant::now(); - while let Some(res) = sync_hot_tier_tasks.next().await { - if let Err(err) = res { - error!("Failed to run hot tier sync task {err:?}"); - return Err(err); - } + /// Abort and remove per-stream tasks. Caller must ensure no further work + /// will be enqueued for the stream after this returns. 
+ async fn abort_stream_tasks(&self, stream: &str, tenant_id: &Option) { + let key: StreamKey = (tenant_id.clone(), stream.to_owned()); + if let Some(t) = self.tasks.write().await.remove(&key) { + t.latest.abort(); + t.historic.abort(); + warn!(stream = %stream, tenant = ?tenant_id, "aborted per-stream hot tier tasks"); } - warn!( - phase = ?phase, - elapsed_ms = tick_start.elapsed().as_millis() as u64, - "hot tier tick complete" - ); - Ok(()) } /// process the hot tier files for the stream From 43e437c704ad8599de6ebf4101197415fcb8a2c2 Mon Sep 17 00:00:00 2001 From: Anant Vindal Date: Fri, 8 May 2026 12:54:48 +0530 Subject: [PATCH 12/18] fix: deepsource, coderabbit suggestions --- src/handlers/http/logstream.rs | 7 +- src/hottier.rs | 241 +++++++++++++++++++-------------- src/storage/azure_blob.rs | 48 ++++--- src/storage/s3.rs | 5 +- 4 files changed, 171 insertions(+), 130 deletions(-) diff --git a/src/handlers/http/logstream.rs b/src/handlers/http/logstream.rs index 1abe4e075..74af54d98 100644 --- a/src/handlers/http/logstream.rs +++ b/src/handlers/http/logstream.rs @@ -455,9 +455,6 @@ pub async fn put_stream_hot_tier( hot_tier_manager .put_hot_tier(&stream_name, &mut hottier, &tenant_id) .await?; - hot_tier_manager - .spawn_stream_tasks(stream_name.clone(), tenant_id.clone()) - .await; let mut stream_metadata: ObjectStoreFormat = serde_json::from_slice( &PARSEABLE @@ -472,7 +469,9 @@ pub async fn put_stream_hot_tier( .metastore .put_stream_json(&stream_metadata, &stream_name, &tenant_id) .await?; - + hot_tier_manager + .spawn_stream_tasks(stream_name.clone(), tenant_id.clone()) + .await; Ok(( format!("hot tier set for stream {stream_name}"), StatusCode::OK, diff --git a/src/hottier.rs b/src/hottier.rs index 9c7e0f6ed..f9dbafb43 100644 --- a/src/hottier.rs +++ b/src/hottier.rs @@ -16,6 +16,7 @@ * */ +use datafusion::common::HashSet; use std::{ collections::{BTreeMap, HashMap}, io, @@ -44,7 +45,7 @@ use std::time::Duration; use sysinfo::Disks; use 
tokio::fs::{self, DirEntry}; use tokio_stream::wrappers::ReadDirStream; -use tracing::{error, warn}; +use tracing::{error, info}; pub const STREAM_HOT_TIER_FILENAME: &str = ".hot_tier.json"; pub const MIN_STREAM_HOT_TIER_SIZE_BYTES: u64 = 10737418240; // 10 GiB @@ -144,7 +145,7 @@ impl HotTierManager { stream: &str, tenant_id: &Option, ) -> Result { - warn!(stream = %stream, tenant = ?tenant_id, "reconcile starting"); + info!(stream = %stream, tenant = ?tenant_id, "reconcile starting"); let mut sht = self.get_hot_tier(stream, tenant_id).await?; let dates = self.fetch_hot_tier_dates(stream, tenant_id).await?; let mut total_used: u64 = 0; @@ -158,84 +159,36 @@ impl HotTierManager { continue; } + let mut on_disk: HashSet = HashSet::new(); + // Pass 1: collect on-disk parquet files (drop .partial orphans). - let mut on_disk: std::collections::HashSet = std::collections::HashSet::new(); - let mut entries = fs::read_dir(&date_dir).await?; - while let Some(entry) = entries.next_entry().await? { - let p = entry.path(); - let Some(name_os) = p.file_name() else { - continue; - }; - let name = name_os.to_string_lossy(); - if name.ends_with(".partial") { - let _ = fs::remove_file(&p).await; - partials_removed += 1; - warn!( - stream = %stream, - tenant = ?tenant_id, - path = %p.display(), - "reconcile: deleted partial orphan" - ); - continue; - } - if name.ends_with(".manifest.json") { - continue; - } - if !p.is_file() { - continue; - } - on_disk.insert(name.into_owned()); - } + self.drop_partials( + &mut on_disk, + &date_dir, + &mut partials_removed, + stream, + tenant_id, + ) + .await?; // Pass 2: clean manifest of stale entries. 
- let manifest_path = date_dir.join("hottier.manifest.json"); - let mut manifest: Manifest = if manifest_path.exists() { - let bytes = fs::read(&manifest_path).await?; - serde_json::from_slice(&bytes).unwrap_or_default() - } else { - Manifest::default() - }; - - let mut kept = Vec::with_capacity(manifest.files.len()); - let mut keep_names: std::collections::HashSet = - std::collections::HashSet::new(); - for f in manifest.files.drain(..) { - let local = self.hot_tier_path.join(&f.file_path); - let ok = match fs::metadata(&local).await { - Ok(m) => m.len() == f.file_size, - Err(_) => false, - }; - if ok { - if let Some(name) = local.file_name().and_then(|s| s.to_str()) { - keep_names.insert(name.to_owned()); - } - total_used += f.file_size; - kept.push(f); - } else { - let _ = fs::remove_file(&local).await; - entries_dropped += 1; - warn!( - stream = %stream, - tenant = ?tenant_id, - file = %f.file_path, - "reconcile: dropped manifest entry (file missing or wrong size)" - ); - } - } - kept.sort_by_key(|f| f.file_path.clone()); - manifest.files = kept; - - if manifest_path.exists() || !manifest.files.is_empty() { - fs::create_dir_all(&date_dir).await?; - fs::write(&manifest_path, serde_json::to_vec(&manifest)?).await?; - } + let mut keep_names: HashSet = HashSet::new(); + self.clean_manifest( + &mut keep_names, + &date_dir, + &mut total_used, + &mut entries_dropped, + stream, + tenant_id, + ) + .await?; // Pass 3: delete on-disk parquet files not referenced by the cleaned manifest. 
for name in on_disk.difference(&keep_names) { let p = date_dir.join(name); let _ = fs::remove_file(&p).await; orphans_removed += 1; - warn!( + info!( stream = %stream, tenant = ?tenant_id, file = %p.display(), @@ -247,7 +200,7 @@ impl HotTierManager { sht.used_size = total_used; sht.available_size = sht.size.saturating_sub(total_used); self.put_hot_tier(stream, &mut sht, tenant_id).await?; - warn!( + info!( stream = %stream, tenant = ?tenant_id, partials_removed, @@ -260,6 +213,94 @@ impl HotTierManager { Ok(sht) } + async fn drop_partials( + &self, + on_disk: &mut HashSet, + date_dir: &PathBuf, + partials_removed: &mut usize, + stream: &str, + tenant_id: &Option, + ) -> Result<(), HotTierError> { + let mut entries = fs::read_dir(date_dir).await?; + while let Some(entry) = entries.next_entry().await? { + let p = entry.path(); + let Some(name_os) = p.file_name() else { + continue; + }; + let name = name_os.to_string_lossy(); + if name.ends_with(".partial") { + let _ = fs::remove_file(&p).await; + *partials_removed += 1; + info!( + stream = %stream, + tenant = ?tenant_id, + path = %p.display(), + "reconcile: deleted partial orphan" + ); + continue; + } + if name.ends_with(".manifest.json") { + continue; + } + if !p.is_file() { + continue; + } + on_disk.insert(name.into_owned()); + } + Ok(()) + } + + async fn clean_manifest( + &self, + keep_names: &mut HashSet, + date_dir: &PathBuf, + total_used: &mut u64, + entries_dropped: &mut usize, + stream: &str, + tenant_id: &Option, + ) -> Result<(), HotTierError> { + let manifest_path = date_dir.join("hottier.manifest.json"); + let mut manifest: Manifest = if manifest_path.exists() { + let bytes = fs::read(&manifest_path).await?; + serde_json::from_slice(&bytes).unwrap_or_default() + } else { + Manifest::default() + }; + + let mut kept = Vec::with_capacity(manifest.files.len()); + for f in manifest.files.drain(..) 
{ + let local = self.hot_tier_path.join(&f.file_path); + let ok = match fs::metadata(&local).await { + Ok(m) => m.len() == f.file_size, + Err(_) => false, + }; + if ok { + if let Some(name) = local.file_name().and_then(|s| s.to_str()) { + keep_names.insert(name.to_owned()); + } + *total_used += f.file_size; + kept.push(f); + } else { + let _ = fs::remove_file(&local).await; + *entries_dropped += 1; + info!( + stream = %stream, + tenant = ?tenant_id, + file = %f.file_path, + "reconcile: dropped manifest entry (file missing or wrong size)" + ); + } + } + kept.sort_by_key(|f| f.file_path.clone()); + manifest.files = kept; + + if manifest_path.exists() || !manifest.files.is_empty() { + fs::create_dir_all(&date_dir).await?; + fs::write(&manifest_path, serde_json::to_vec(&manifest)?).await?; + } + Ok(()) + } + /// Get a global pub fn global() -> Option<&'static HotTierManager> { static INSTANCE: OnceCell = OnceCell::new(); @@ -453,7 +494,7 @@ impl HotTierManager { { let latest_min = PARSEABLE.options.hot_tier_latest_minutes; let historic_min = PARSEABLE.options.hot_tier_historic_sync_minutes.max(1); - warn!( + info!( latest_minutes = latest_min, historic_sync_minutes = historic_min, "hot tier scheduler starting" @@ -501,13 +542,13 @@ impl HotTierManager { PARSEABLE.options.hot_tier_historic_sync_minutes.max(1) as u64 * 60, ); - warn!(stream = %stream, tenant = ?tenant_id, "spawning per-stream hot tier tasks"); + info!(stream = %stream, tenant = ?tenant_id, "spawning per-stream hot tier tasks"); let s = stream.clone(); let t = tenant_id.clone(); let latest = tokio::spawn(async move { loop { - warn!(stream = %s, tenant = ?t, phase = ?SyncPhase::Latest, "stream tick fired"); + info!(stream = %s, tenant = ?t, phase = ?SyncPhase::Latest, "stream tick fired"); if let Err(err) = self .process_stream(s.clone(), t.clone(), SyncPhase::Latest) .await @@ -522,7 +563,7 @@ impl HotTierManager { let t = tenant_id.clone(); let historic = tokio::spawn(async move { loop { - warn!(stream = 
%s, tenant = ?t, phase = ?SyncPhase::Historic, "stream tick fired"); + info!(stream = %s, tenant = ?t, phase = ?SyncPhase::Historic, "stream tick fired"); if let Err(err) = self .process_stream(s.clone(), t.clone(), SyncPhase::Historic) .await @@ -547,7 +588,7 @@ impl HotTierManager { if let Some(t) = self.tasks.write().await.remove(&key) { t.latest.abort(); t.historic.abort(); - warn!(stream = %stream, tenant = ?tenant_id, "aborted per-stream hot tier tasks"); + info!(stream = %stream, tenant = ?tenant_id, "aborted per-stream hot tier tasks"); } } @@ -559,7 +600,7 @@ impl HotTierManager { tenant_id: Option, phase: SyncPhase, ) -> Result<(), HotTierError> { - warn!( + info!( stream = %stream, tenant = ?tenant_id, phase = ?phase, @@ -579,7 +620,7 @@ impl HotTierManager { .values() .map(|m| m.iter().map(|x| x.files.len()).sum::()) .sum(); - warn!( + info!( stream = %stream, tenant = ?tenant_id, phase = ?phase, @@ -592,7 +633,7 @@ impl HotTierManager { self.process_manifest(&stream, &mut s3_manifest_file_list, &tenant_id, phase) .await?; - warn!( + info!( stream = %stream, tenant = ?tenant_id, phase = ?phase, @@ -628,7 +669,7 @@ impl HotTierManager { match NaiveDate::parse_from_str(str_date.trim_start_matches("date="), "%Y-%m-%d") { Ok(d) => d, Err(_) => { - warn!("Invalid date format: {}", str_date); + info!("Invalid date format: {}", str_date); continue; } }; @@ -658,7 +699,7 @@ impl HotTierManager { let work_count = work.len(); let total_bytes: u64 = work.iter().map(|(_, f, _)| f.file_size).sum(); if work.is_empty() { - warn!( + info!( stream = %stream, tenant = ?tenant_id, phase = ?phase, @@ -666,7 +707,7 @@ impl HotTierManager { ); return Ok(()); } - warn!( + info!( stream = %stream, tenant = ?tenant_id, phase = ?phase, @@ -710,7 +751,7 @@ impl HotTierManager { ) .await?; if !processed && !stop.swap(true, std::sync::atomic::Ordering::Relaxed) { - warn!( + info!( stream = %stream, tenant = ?tenant_id, phase = ?phase, @@ -749,7 +790,7 @@ impl HotTierManager { // 
RESERVE { let mut sht = state.sht.lock().await; - warn!( + info!( stream = %stream, tenant = ?tenant_id, phase = ?phase, @@ -760,11 +801,11 @@ impl HotTierManager { "reserving" ); if !self.is_disk_available(parquet_file.file_size).await? - || sht.available_size <= parquet_file.file_size + || sht.available_size < parquet_file.file_size { match phase { SyncPhase::Latest => { - warn!( + info!( stream = %stream, tenant = ?tenant_id, file = %parquet_file.file_path, @@ -782,7 +823,7 @@ impl HotTierManager { ) .await? { - warn!( + info!( stream = %stream, tenant = ?tenant_id, file = %parquet_file.file_path, @@ -793,7 +834,7 @@ impl HotTierManager { } } SyncPhase::Historic => { - warn!( + info!( stream = %stream, tenant = ?tenant_id, file = %parquet_file.file_path, @@ -805,8 +846,8 @@ impl HotTierManager { } } } - if sht.available_size <= parquet_file.file_size { - warn!( + if sht.available_size < parquet_file.file_size { + info!( stream = %stream, tenant = ?tenant_id, file = %parquet_file.file_path, @@ -818,7 +859,7 @@ impl HotTierManager { } sht.available_size -= parquet_file.file_size; self.put_hot_tier(stream, &mut sht, tenant_id).await?; - warn!( + info!( stream = %stream, tenant = ?tenant_id, file = %parquet_file.file_path, @@ -831,7 +872,7 @@ impl HotTierManager { // DOWNLOAD (no lock held) let parquet_file_path = RelativePathBuf::from(parquet_file.file_path.clone()); fs::create_dir_all(parquet_path.parent().unwrap()).await?; - warn!( + info!( stream = %stream, tenant = ?tenant_id, file = %parquet_file.file_path, @@ -847,7 +888,7 @@ impl HotTierManager { let dl_elapsed = dl_start.elapsed(); if let Err(e) = download_result { - warn!( + info!( stream = %stream, tenant = ?tenant_id, file = %parquet_file.file_path, @@ -870,7 +911,7 @@ impl HotTierManager { } else { 0.0 }; - warn!( + info!( stream = %stream, tenant = ?tenant_id, file = %parquet_file.file_path, @@ -899,7 +940,7 @@ impl HotTierManager { .join("hottier.manifest.json"); 
fs::create_dir_all(manifest_path.parent().unwrap()).await?; fs::write(manifest_path, serde_json::to_vec(&hot_tier_manifest)?).await?; - warn!( + info!( stream = %stream, tenant = ?tenant_id, file = %parquet_file.file_path, @@ -1086,7 +1127,7 @@ impl HotTierManager { parquet_file_size: u64, tenant_id: &Option, ) -> Result { - warn!( + info!( stream = %stream, tenant = ?tenant_id, target_size = parquet_file_size, @@ -1127,7 +1168,7 @@ impl HotTierManager { extract_datetime(path_to_delete.to_str().unwrap()), ) && download_date_time <= delete_date_time { - warn!( + info!( stream = %stream, tenant = ?tenant_id, candidate = %file_to_delete.file_path, @@ -1152,7 +1193,7 @@ impl HotTierManager { .await?; delete_successful = true; freed_total += file_size; - warn!( + info!( stream = %stream, tenant = ?tenant_id, evicted_file = %file_to_delete.file_path, @@ -1174,7 +1215,7 @@ impl HotTierManager { } } - warn!( + info!( stream = %stream, tenant = ?tenant_id, freed_total, diff --git a/src/storage/azure_blob.rs b/src/storage/azure_blob.rs index 52a4668aa..1f81a847c 100644 --- a/src/storage/azure_blob.rs +++ b/src/storage/azure_blob.rs @@ -268,32 +268,30 @@ impl BlobStore { let client = self.client.clone(); let semaphore = Arc::new(tokio::sync::Semaphore::new(concurrency)); - let mut handles = Vec::with_capacity(ranges.len()); - for r in ranges { - let client = client.clone(); - let src = src.clone(); - let std_file = std_file.clone(); - let semaphore = semaphore.clone(); - handles.push(tokio::spawn(async move { - let _permit = semaphore - .acquire_owned() + futures::stream::iter(ranges) + .map(|r| { + let client = client.clone(); + let src = src.clone(); + let std_file = std_file.clone(); + let semaphore = semaphore.clone(); + async move { + let _permit = semaphore.acquire_owned().await.map_err(|e| { + ObjectStorageError::Custom(format!("semaphore closed: {e}")) + })?; + let bytes = client.get_range(&src, r.clone()).await?; + let offset = r.start; + 
tokio::task::spawn_blocking(move || -> std::io::Result<()> { + use std::os::unix::fs::FileExt; + std_file.write_all_at(&bytes, offset) + }) .await - .map_err(|e| ObjectStorageError::Custom(format!("semaphore closed: {e}")))?; - let bytes = client.get_range(&src, r.clone()).await?; - let offset = r.start; - tokio::task::spawn_blocking(move || -> std::io::Result<()> { - use std::os::unix::fs::FileExt; - std_file.write_all_at(&bytes, offset) - }) - .await - .map_err(|e| ObjectStorageError::Custom(format!("join: {e}")))??; - Ok::<_, ObjectStorageError>(()) - })); - } - for h in handles { - h.await - .map_err(|e| ObjectStorageError::Custom(format!("join error: {e}")))??; - } + .map_err(|e| ObjectStorageError::Custom(format!("join: {e}")))??; + Ok::<_, ObjectStorageError>(()) + } + }) + .buffer_unordered(concurrency) + .try_collect::>() + .await?; let std_file_sync = std_file.clone(); tokio::task::spawn_blocking(move || std_file_sync.sync_all()) diff --git a/src/storage/s3.rs b/src/storage/s3.rs index 1ba95ef76..51f979ff8 100644 --- a/src/storage/s3.rs +++ b/src/storage/s3.rs @@ -351,7 +351,10 @@ impl S3 { .await { Ok(()) => { - tokio::fs::rename(&partial, &write_path).await?; + if let Err(e) = tokio::fs::rename(&partial, &write_path).await { + let _ = tokio::fs::remove_file(&partial).await; + return Err(e.into()); + } Ok(()) } Err(e) => { From b185b38c89e20c1d5a6145c5514827f8b347ebbe Mon Sep 17 00:00:00 2001 From: Anant Vindal Date: Fri, 8 May 2026 16:18:21 +0530 Subject: [PATCH 13/18] hottier deletion bug --- src/hottier.rs | 132 +++++++++++++++++++++++++++++++------------------ 1 file changed, 85 insertions(+), 47 deletions(-) diff --git a/src/hottier.rs b/src/hottier.rs index f9dbafb43..883f610a5 100644 --- a/src/hottier.rs +++ b/src/hottier.rs @@ -857,7 +857,20 @@ impl HotTierManager { ); return Ok(false); } - sht.available_size -= parquet_file.file_size; + sht.available_size = + if let Some(val) = sht.available_size.checked_sub(parquet_file.file_size) { + val + } 
else { + tracing::error!( + stream = %stream, + tenant = ?tenant_id, + file = %parquet_file.file_path, + file_size = parquet_file.file_size, + available = sht.available_size, + "file_size > sht.available_size, setting available_size to 0 and moving on" + ); + 0 + }; self.put_hot_tier(stream, &mut sht, tenant_id).await?; info!( stream = %stream, @@ -1113,7 +1126,7 @@ impl HotTierManager { path.exists() } - ///delete the parquet file from the hot tier directory for the stream + /// delete entire parquet file minute from the hot tier directory for the stream /// loop through all manifests in the hot tier directory for the stream /// loop through all parquet files in the manifest /// check for the oldest entry to delete if the path exists in hot tier @@ -1155,61 +1168,86 @@ impl HotTierManager { let file = fs::read(manifest_file.path()).await?; let mut manifest: Manifest = serde_json::from_slice(&file)?; - manifest.files.sort_by_key(|file| file.file_path.clone()); - manifest.files.reverse(); - - 'loop_files: while let Some(file_to_delete) = manifest.files.pop() { - let file_size = file_to_delete.file_size; - let path_to_delete = self.hot_tier_path.join(&file_to_delete.file_path); - - if path_to_delete.exists() { - if let (Some(download_date_time), Some(delete_date_time)) = ( - extract_datetime(download_file_path.to_str().unwrap()), - extract_datetime(path_to_delete.to_str().unwrap()), - ) && download_date_time <= delete_date_time - { - info!( - stream = %stream, - tenant = ?tenant_id, - candidate = %file_to_delete.file_path, - target = %download_file_path.display(), - "skip evict: candidate newer than target" - ); - delete_successful = false; - break 'loop_files; - } - - fs::write(manifest_file.path(), serde_json::to_vec(&manifest)?).await?; + if manifest.files.is_empty() { + continue; + } - fs::remove_dir_all(path_to_delete.parent().unwrap()).await?; - delete_empty_directory_hot_tier( - path_to_delete.parent().unwrap().to_path_buf(), - ) - .await?; + // sort in an 
ascending manner + // idx0: minute=00 + // idx59: minute=59 + manifest.files.sort_by_key(|file| file.file_path.clone()); - stream_hot_tier.used_size -= file_size; - stream_hot_tier.available_size += file_size; - self.put_hot_tier(stream, stream_hot_tier, tenant_id) - .await?; - delete_successful = true; - freed_total += file_size; + // get first file's parent (/hottier/stream/date=d/hour=h/minute=m) + let first_file = manifest.files.first().unwrap(); + let first_file_path = self.hot_tier_path.join(&first_file.file_path); + let minute_to_delete = first_file_path.parent().unwrap(); + + if minute_to_delete.exists() { + if let (Some(download_date_time), Some(delete_date_time)) = ( + extract_datetime(download_file_path.to_str().unwrap()), + extract_datetime(first_file_path.to_str().unwrap()), + ) && download_date_time <= delete_date_time + { info!( stream = %stream, tenant = ?tenant_id, - evicted_file = %file_to_delete.file_path, - evicted_size = file_size, - freed_total, - new_available = stream_hot_tier.available_size, - "evicted" + candidate = %minute_to_delete.display(), + target = %download_file_path.display(), + "skip evict: candidate newer than target" ); + continue; + } - if stream_hot_tier.available_size <= parquet_file_size { - continue 'loop_files; + let minute_to_delete_owned = minute_to_delete.to_path_buf(); + let mut minute_freed: u64 = 0; + manifest.files.retain(|file| { + let file_path = self.hot_tier_path.join(&file.file_path); + let file_minute = file_path.parent().unwrap(); + if file_minute == minute_to_delete_owned { + minute_freed = minute_freed.saturating_add(file.file_size); + false } else { - break 'loop_dates; + true } + }); + + stream_hot_tier.used_size = stream_hot_tier + .used_size + .checked_sub(minute_freed) + .unwrap_or_else(|| { + tracing::error!( + stream = %stream, + tenant = ?tenant_id, + minute = %minute_to_delete_owned.display(), + minute_freed, + used_size = stream_hot_tier.used_size, + "minute_freed > used_size, clamping used_size 
to 0" + ); + 0 + }); + stream_hot_tier.available_size = + stream_hot_tier.available_size.saturating_add(minute_freed); + freed_total = freed_total.saturating_add(minute_freed); + + fs::write(manifest_file.path(), serde_json::to_vec(&manifest)?).await?; + fs::remove_dir_all(&minute_to_delete_owned).await?; + delete_empty_directory_hot_tier(minute_to_delete_owned.clone()).await?; + self.put_hot_tier(stream, stream_hot_tier, tenant_id) + .await?; + delete_successful = true; + info!( + stream = %stream, + tenant = ?tenant_id, + evicted_minute = %minute_to_delete_owned.display(), + evicted_size = minute_freed, + freed_total, + new_available = stream_hot_tier.available_size, + "evicted" + ); + if stream_hot_tier.available_size <= parquet_file_size { + continue; } else { - fs::write(manifest_file.path(), serde_json::to_vec(&manifest)?).await?; + break 'loop_dates; } } } From b07cc3d4a54888fd88e8699c95cbe97a14cd0050 Mon Sep 17 00:00:00 2001 From: Anant Vindal Date: Sun, 10 May 2026 18:49:03 +0530 Subject: [PATCH 14/18] x86 build + other fixes --- src/cli.rs | 8 +- src/hottier.rs | 163 ++++++++++++++++++++++++---------- src/storage/azure_blob.rs | 21 +++-- src/storage/gcs.rs | 65 +++++++------- src/storage/localfs.rs | 2 +- src/storage/mod.rs | 38 ++++++++ src/storage/object_storage.rs | 2 +- src/storage/s3.rs | 60 ++++++------- 8 files changed, 232 insertions(+), 127 deletions(-) diff --git a/src/cli.rs b/src/cli.rs index 6b5e9d7e4..07ae2e843 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -317,6 +317,7 @@ pub struct Options { #[arg( long = "hot-tier-download-chunk-size", env = "P_HOT_TIER_DOWNLOAD_CHUNK_SIZE", + value_parser = clap::value_parser!(u64).range(5242880..), default_value = "8388608", help = "Chunk size in bytes for parallel hot tier downloads (default 8 MiB)" )] @@ -325,10 +326,11 @@ pub struct Options { #[arg( long = "hot-tier-download-concurrency", env = "P_HOT_TIER_DOWNLOAD_CONCURRENCY", + value_parser = clap::value_parser!(u64).range(1..), default_value = 
"16", help = "Number of concurrent range requests per hot tier download" )] - pub hot_tier_download_concurrency: usize, + pub hot_tier_download_concurrency: u64, #[arg( long = "hot-tier-files-per-stream-concurrency", @@ -341,14 +343,16 @@ pub struct Options { #[arg( long = "hot-tier-latest-minutes", env = "P_HOT_TIER_LATEST_MINUTES", + value_parser = clap::value_parser!(u64).range(1..), default_value = "10", help = "Files whose timestamp is within the last N minutes are 'latest'; rest are 'historic'." )] - pub hot_tier_latest_minutes: i64, + pub hot_tier_latest_minutes: u64, #[arg( long = "hot-tier-historic-sync-minutes", env = "P_HOT_TIER_HISTORIC_SYNC_MINUTES", + value_parser = clap::value_parser!(u32).range(1..), default_value = "5", help = "Interval (minutes) at which the historic hot-tier sync runs." )] diff --git a/src/hottier.rs b/src/hottier.rs index 883f610a5..20bacbd34 100644 --- a/src/hottier.rs +++ b/src/hottier.rs @@ -116,15 +116,19 @@ impl HotTierManager { if let Some(state) = self.state_cache.read().await.get(&key).cloned() { return Ok(state); } - let mut cache = self.state_cache.write().await; - if let Some(state) = cache.get(&key).cloned() { - return Ok(state); - } + // key not present, reconcile let sht = self.reconcile_stream(stream, tenant_id).await?; let state = Arc::new(StreamSyncState { sht: AsyncMutex::new(sht), }); - cache.insert(key, state.clone()); + + let mut cache = self.state_cache.write().await; + if cache.insert(key, state.clone()).is_some() { + tracing::warn!( + "Key- {:?} was absent during read lock but already exists after reconcile!", + (tenant_id, stream), + ); + }; Ok(state) } @@ -221,31 +225,41 @@ impl HotTierManager { stream: &str, tenant_id: &Option, ) -> Result<(), HotTierError> { - let mut entries = fs::read_dir(date_dir).await?; - while let Some(entry) = entries.next_entry().await? 
{ - let p = entry.path(); - let Some(name_os) = p.file_name() else { - continue; - }; - let name = name_os.to_string_lossy(); - if name.ends_with(".partial") { - let _ = fs::remove_file(&p).await; - *partials_removed += 1; - info!( - stream = %stream, - tenant = ?tenant_id, - path = %p.display(), - "reconcile: deleted partial orphan" - ); - continue; - } - if name.ends_with(".manifest.json") { - continue; - } - if !p.is_file() { - continue; + let mut stack: Vec = vec![date_dir.clone()]; + while let Some(dir) = stack.pop() { + let mut entries = fs::read_dir(&dir).await?; + while let Some(entry) = entries.next_entry().await? { + let p = entry.path(); + let ft = entry.file_type().await?; + if ft.is_dir() { + stack.push(p); + continue; + } + let Some(name_os) = p.file_name() else { + continue; + }; + let name = name_os.to_string_lossy(); + if name.ends_with(".partial") { + let _ = fs::remove_file(&p).await; + *partials_removed += 1; + info!( + stream = %stream, + tenant = ?tenant_id, + path = %p.display(), + "reconcile: deleted partial orphan" + ); + continue; + } + if name.ends_with(".manifest.json") { + continue; + } + if !ft.is_file() { + continue; + } + if let Ok(rel) = p.strip_prefix(date_dir) { + on_disk.insert(rel.to_string_lossy().into_owned()); + } } - on_disk.insert(name.into_owned()); } Ok(()) } @@ -275,8 +289,8 @@ impl HotTierManager { Err(_) => false, }; if ok { - if let Some(name) = local.file_name().and_then(|s| s.to_str()) { - keep_names.insert(name.to_owned()); + if let Ok(rel) = local.strip_prefix(date_dir) { + keep_names.insert(rel.to_string_lossy().into_owned()); } *total_used += f.file_size; kept.push(f); @@ -493,7 +507,7 @@ impl HotTierManager { 'a: 'static, { let latest_min = PARSEABLE.options.hot_tier_latest_minutes; - let historic_min = PARSEABLE.options.hot_tier_historic_sync_minutes.max(1); + let historic_min = PARSEABLE.options.hot_tier_historic_sync_minutes; info!( latest_minutes = latest_min, historic_sync_minutes = historic_min, @@ -516,6 
+530,21 @@ impl HotTierManager { for stream in PARSEABLE.streams.list(&tenant_id) { if this.check_stream_hot_tier_exists(&stream, &tenant_id) { this.spawn_stream_tasks(stream, tenant_id.clone()).await; + } else { + // check for potential orphan directory on disk + let path = if let Some(tenant_id) = tenant_id.as_ref() { + self.hot_tier_path.join(tenant_id).join(stream) + } else { + self.hot_tier_path.join(stream) + }; + if path.exists() { + // delete this entire folder as stream meta says no hottier for stream + if let Err(e) = fs::remove_dir_all(&path).await { + tracing::error!( + "Unable to remove orphaned hottier dir- `{path:?}` with error- {e}" + ); + }; + } } } } @@ -538,9 +567,8 @@ impl HotTierManager { } let latest_interval = Duration::from_secs(60); - let historic_interval = Duration::from_secs( - PARSEABLE.options.hot_tier_historic_sync_minutes.max(1) as u64 * 60, - ); + let historic_interval = + Duration::from_secs(PARSEABLE.options.hot_tier_historic_sync_minutes as u64 * 60); info!(stream = %stream, tenant = ?tenant_id, "spawning per-stream hot tier tasks"); @@ -588,6 +616,12 @@ impl HotTierManager { if let Some(t) = self.tasks.write().await.remove(&key) { t.latest.abort(); t.historic.abort(); + + // run these tasks till completion or till JoinError + // post this, we delete the folders so its better if these tasks complete + // and then deletion happens + let _ = t.latest.await; + let _ = t.historic.await; info!(stream = %stream, tenant = ?tenant_id, "aborted per-stream hot tier tasks"); } } @@ -658,8 +692,9 @@ impl HotTierManager { return Ok(()); } - let latest_minutes = PARSEABLE.options.hot_tier_latest_minutes.max(0); - let cutoff = chrono::Utc::now().naive_utc() - chrono::Duration::minutes(latest_minutes); + let latest_minutes = PARSEABLE.options.hot_tier_latest_minutes; + let cutoff = chrono::Utc::now().naive_utc() + - chrono::Duration::minutes(latest_minutes.try_into().unwrap()); // Build flat work list: (date, file, local_path) ordered 
latest-date-first, // and within a date by descending file_path (matches old `pop()` order). @@ -717,10 +752,7 @@ impl HotTierManager { ); let state = self.get_or_load_state(stream, tenant_id).await?; - let concurrency = PARSEABLE - .options - .hot_tier_files_per_stream_concurrency - .max(4); + let concurrency = PARSEABLE.options.hot_tier_files_per_stream_concurrency; // Reservation failure (out of disk + nothing to evict) is sticky: // once one file can't be placed, no subsequent file will fit either. @@ -896,7 +928,7 @@ impl HotTierManager { let download_result = PARSEABLE .storage .get_object_store() - .buffered_write(&parquet_file_path, tenant_id, parquet_path.clone()) + .parallel_chunked_download(&parquet_file_path, tenant_id, parquet_path.clone()) .await; let dl_elapsed = dl_start.elapsed(); @@ -1150,9 +1182,23 @@ impl HotTierManager { let mut delete_successful = false; let mut freed_total: u64 = 0; let dates = self.fetch_hot_tier_dates(stream, tenant_id).await?; + if dates.is_empty() { + info!( + stream = %stream, + tenant = ?tenant_id, + "eviction: no date dirs found, nothing to evict" + ); + } 'loop_dates: for date in dates { let path = self.get_stream_path_for_date(stream, &date, tenant_id); if !path.exists() { + info!( + stream = %stream, + tenant = ?tenant_id, + date = %date, + path = %path.display(), + "eviction: date path missing, skipping" + ); continue; } @@ -1164,11 +1210,27 @@ impl HotTierManager { .to_string_lossy() .ends_with(".manifest.json") }); + if manifest_files.is_empty() { + info!( + stream = %stream, + tenant = ?tenant_id, + date = %date, + path = %path.display(), + "eviction: no .manifest.json files in date dir" + ); + continue; + } for manifest_file in manifest_files { let file = fs::read(manifest_file.path()).await?; let mut manifest: Manifest = serde_json::from_slice(&file)?; if manifest.files.is_empty() { + info!( + stream = %stream, + tenant = ?tenant_id, + manifest = %manifest_file.path().display(), + "eviction: manifest has zero 
file entries" + ); continue; } @@ -1182,7 +1244,18 @@ impl HotTierManager { let first_file_path = self.hot_tier_path.join(&first_file.file_path); let minute_to_delete = first_file_path.parent().unwrap(); - if minute_to_delete.exists() { + if !minute_to_delete.exists() { + info!( + stream = %stream, + tenant = ?tenant_id, + manifest = %manifest_file.path().display(), + first_file = %first_file.file_path, + minute = %minute_to_delete.display(), + "eviction: minute dir referenced by manifest does not exist on disk" + ); + continue; + } + { if let (Some(download_date_time), Some(delete_date_time)) = ( extract_datetime(download_file_path.to_str().unwrap()), extract_datetime(first_file_path.to_str().unwrap()), @@ -1244,7 +1317,7 @@ impl HotTierManager { new_available = stream_hot_tier.available_size, "evicted" ); - if stream_hot_tier.available_size <= parquet_file_size { + if stream_hot_tier.available_size < parquet_file_size { continue; } else { break 'loop_dates; @@ -1263,7 +1336,7 @@ impl HotTierManager { Ok(delete_successful) } - ///check if the disk is available to download the parquet file + /// check if the disk is available to download the parquet file /// check if the disk usage is above the threshold pub async fn is_disk_available(&self, size_to_download: u64) -> Result { if let Some(DiskUtil { diff --git a/src/storage/azure_blob.rs b/src/storage/azure_blob.rs index 1f81a847c..7bf6c033a 100644 --- a/src/storage/azure_blob.rs +++ b/src/storage/azure_blob.rs @@ -224,7 +224,10 @@ impl BlobStore { .await { Ok(()) => { - tokio::fs::rename(&partial, &write_path).await?; + if let Err(e) = tokio::fs::rename(&partial, &write_path).await { + let _ = tokio::fs::remove_file(&partial).await; + return Err(e.into()); + } Ok(()) } Err(e) => { @@ -255,18 +258,15 @@ impl BlobStore { file.set_len(total).await?; let std_file = Arc::new(file.into_std().await); - let chunk = PARSEABLE - .options - .hot_tier_download_chunk_size - .max(8 * 1024 * 1024); - let concurrency = 
PARSEABLE.options.hot_tier_download_concurrency.max(6); + let chunk = PARSEABLE.options.hot_tier_download_chunk_size; + let concurrency = PARSEABLE.options.hot_tier_download_concurrency; let ranges: Vec> = (0..total) .step_by(chunk as usize) .map(|s| s..(s + chunk).min(total)) .collect(); let chunk_count = ranges.len() as u64; let client = self.client.clone(); - let semaphore = Arc::new(tokio::sync::Semaphore::new(concurrency)); + let semaphore = Arc::new(tokio::sync::Semaphore::new(concurrency as usize)); futures::stream::iter(ranges) .map(|r| { @@ -281,15 +281,14 @@ impl BlobStore { let bytes = client.get_range(&src, r.clone()).await?; let offset = r.start; tokio::task::spawn_blocking(move || -> std::io::Result<()> { - use std::os::unix::fs::FileExt; - std_file.write_all_at(&bytes, offset) + crate::storage::write_all_at(&std_file, &bytes, offset) }) .await .map_err(|e| ObjectStorageError::Custom(format!("join: {e}")))??; Ok::<_, ObjectStorageError>(()) } }) - .buffer_unordered(concurrency) + .buffer_unordered(concurrency as usize) .try_collect::>() .await?; @@ -572,7 +571,7 @@ impl BlobStore { #[async_trait] impl ObjectStorage for BlobStore { - async fn buffered_write( + async fn parallel_chunked_download( &self, path: &RelativePath, tenant_id: &Option, diff --git a/src/storage/gcs.rs b/src/storage/gcs.rs index 75dd141b6..de1f8e578 100644 --- a/src/storage/gcs.rs +++ b/src/storage/gcs.rs @@ -190,7 +190,10 @@ impl Gcs { .await { Ok(()) => { - tokio::fs::rename(&partial, &write_path).await?; + if let Err(e) = tokio::fs::rename(&partial, &write_path).await { + let _ = tokio::fs::remove_file(&partial).await; + return Err(e.into()); + } Ok(()) } Err(e) => { @@ -221,45 +224,39 @@ impl Gcs { file.set_len(total).await?; let std_file = Arc::new(file.into_std().await); - let chunk = PARSEABLE - .options - .hot_tier_download_chunk_size - .max(8 * 1024 * 1024); - let concurrency = PARSEABLE.options.hot_tier_download_concurrency.max(16); + let chunk = 
PARSEABLE.options.hot_tier_download_chunk_size; + let concurrency = PARSEABLE.options.hot_tier_download_concurrency; let ranges: Vec> = (0..total) .step_by(chunk as usize) .map(|s| s..(s + chunk).min(total)) .collect(); let chunk_count = ranges.len() as u64; let client = self.client.clone(); - let semaphore = Arc::new(tokio::sync::Semaphore::new(concurrency)); - - let mut handles = Vec::with_capacity(ranges.len()); - for r in ranges { - let client = client.clone(); - let src = src.clone(); - let std_file = std_file.clone(); - let semaphore = semaphore.clone(); - handles.push(tokio::spawn(async move { - let _permit = semaphore - .acquire_owned() + let semaphore = Arc::new(tokio::sync::Semaphore::new(concurrency as usize)); + + futures::stream::iter(ranges) + .map(|r| { + let client = client.clone(); + let src = src.clone(); + let std_file = std_file.clone(); + let semaphore = semaphore.clone(); + async move { + let _permit = semaphore.acquire_owned().await.map_err(|e| { + ObjectStorageError::Custom(format!("semaphore closed: {e}")) + })?; + let bytes = client.get_range(&src, r.clone()).await?; + let offset = r.start; + tokio::task::spawn_blocking(move || -> std::io::Result<()> { + crate::storage::write_all_at(&std_file, &bytes, offset) + }) .await - .map_err(|e| ObjectStorageError::Custom(format!("semaphore closed: {e}")))?; - let bytes = client.get_range(&src, r.clone()).await?; - let offset = r.start; - tokio::task::spawn_blocking(move || -> std::io::Result<()> { - use std::os::unix::fs::FileExt; - std_file.write_all_at(&bytes, offset) - }) - .await - .map_err(|e| ObjectStorageError::Custom(format!("join: {e}")))??; - Ok::<_, ObjectStorageError>(()) - })); - } - for h in handles { - h.await - .map_err(|e| ObjectStorageError::Custom(format!("join error: {e}")))??; - } + .map_err(|e| ObjectStorageError::Custom(format!("join: {e}")))??; + Ok::<_, ObjectStorageError>(()) + } + }) + .buffer_unordered(concurrency as usize) + .try_collect::>() + .await?; let 
std_file_sync = std_file.clone(); tokio::task::spawn_blocking(move || std_file_sync.sync_all()) @@ -540,7 +537,7 @@ impl Gcs { #[async_trait] impl ObjectStorage for Gcs { - async fn buffered_write( + async fn parallel_chunked_download( &self, path: &RelativePath, tenant_id: &Option, diff --git a/src/storage/localfs.rs b/src/storage/localfs.rs index 8d6027413..97d909eb6 100644 --- a/src/storage/localfs.rs +++ b/src/storage/localfs.rs @@ -106,7 +106,7 @@ impl LocalFS { #[async_trait] impl ObjectStorage for LocalFS { - async fn buffered_write( + async fn parallel_chunked_download( &self, path: &RelativePath, _tenant_id: &Option, diff --git a/src/storage/mod.rs b/src/storage/mod.rs index 15dd98975..df120cacb 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -48,6 +48,44 @@ pub mod retention; mod s3; pub mod store_metadata; +/// Cross-platform positional write: pwrite(2) on Unix, seek_write+loop on Windows. +/// Both APIs accept `&File`, so concurrent ranged downloads can share an Arc. 
+#[inline(always)] +pub(crate) fn write_all_at(file: &std::fs::File, buf: &[u8], offset: u64) -> std::io::Result<()> { + #[cfg(unix)] + { + use std::os::unix::fs::FileExt; + file.write_all_at(buf, offset) + } + #[cfg(windows)] + { + use std::os::windows::fs::FileExt; + let mut buf = buf; + let mut offset = offset; + while !buf.is_empty() { + match file.seek_write(buf, offset) { + Ok(0) => { + return Err(std::io::Error::new( + std::io::ErrorKind::WriteZero, + "failed to write whole buffer", + )); + } + Ok(n) => { + buf = &buf[n..]; + offset += n as u64; + } + Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {} + Err(e) => return Err(e), + } + } + Ok(()) + } + #[cfg(not(any(unix, windows)))] + { + compile_error!("write_all_at: unsupported platform"); + } +} + use self::retention::Retention; pub use azure_blob::AzureBlobConfig; pub use gcs::GcsConfig; diff --git a/src/storage/object_storage.rs b/src/storage/object_storage.rs index 2c6244733..3e5ccef03 100644 --- a/src/storage/object_storage.rs +++ b/src/storage/object_storage.rs @@ -302,7 +302,7 @@ pub trait ObjectStorage: Debug + Send + Sync + 'static { path: &RelativePath, tenant_id: &Option, ) -> Result; - async fn buffered_write( + async fn parallel_chunked_download( &self, path: &RelativePath, tenant_id: &Option, diff --git a/src/storage/s3.rs b/src/storage/s3.rs index 51f979ff8..98ca91036 100644 --- a/src/storage/s3.rs +++ b/src/storage/s3.rs @@ -385,45 +385,39 @@ impl S3 { file.set_len(total).await?; let std_file = Arc::new(file.into_std().await); - let chunk = PARSEABLE - .options - .hot_tier_download_chunk_size - .max(8 * 1024 * 1024); - let concurrency = PARSEABLE.options.hot_tier_download_concurrency.max(16); + let chunk = PARSEABLE.options.hot_tier_download_chunk_size; + let concurrency = PARSEABLE.options.hot_tier_download_concurrency; let ranges: Vec> = (0..total) .step_by(chunk as usize) .map(|s| s..(s + chunk).min(total)) .collect(); let chunk_count = ranges.len() as u64; let client = 
Arc::new(self.client.clone()); - let semaphore = Arc::new(tokio::sync::Semaphore::new(concurrency)); - - let mut handles = Vec::with_capacity(ranges.len()); - for r in ranges { - let client = client.clone(); - let src = src.clone(); - let std_file = std_file.clone(); - let semaphore = semaphore.clone(); - handles.push(tokio::spawn(async move { - let _permit = semaphore - .acquire_owned() + let semaphore = Arc::new(tokio::sync::Semaphore::new(concurrency as usize)); + + futures::stream::iter(ranges) + .map(|r| { + let client = client.clone(); + let src = src.clone(); + let std_file = std_file.clone(); + let semaphore = semaphore.clone(); + async move { + let _permit = semaphore.acquire_owned().await.map_err(|e| { + ObjectStorageError::Custom(format!("semaphore closed: {e}")) + })?; + let bytes = client.get_range(&src, r.clone()).await?; + let offset = r.start; + tokio::task::spawn_blocking(move || -> std::io::Result<()> { + crate::storage::write_all_at(&std_file, &bytes, offset) + }) .await - .map_err(|e| ObjectStorageError::Custom(format!("semaphore closed: {e}")))?; - let bytes = client.get_range(&src, r.clone()).await?; - let offset = r.start; - tokio::task::spawn_blocking(move || -> std::io::Result<()> { - use std::os::unix::fs::FileExt; - std_file.write_all_at(&bytes, offset) - }) - .await - .map_err(|e| ObjectStorageError::Custom(format!("join: {e}")))??; - Ok::<_, ObjectStorageError>(()) - })); - } - for h in handles { - h.await - .map_err(|e| ObjectStorageError::Custom(format!("join error: {e}")))??; - } + .map_err(|e| ObjectStorageError::Custom(format!("join: {e}")))??; + Ok::<_, ObjectStorageError>(()) + } + }) + .buffer_unordered(concurrency as usize) + .try_collect::>() + .await?; let std_file_sync = std_file.clone(); tokio::task::spawn_blocking(move || std_file_sync.sync_all()) @@ -797,7 +791,7 @@ impl ObjectStorage for S3 { Ok(result?) 
} - async fn buffered_write( + async fn parallel_chunked_download( &self, path: &RelativePath, tenant_id: &Option, From 8af9f0ff087e11b14e4498991618db713530f020 Mon Sep 17 00:00:00 2001 From: Anant Vindal Date: Mon, 11 May 2026 17:38:08 +0530 Subject: [PATCH 15/18] add traces to hottier --- src/handlers/http/logstream.rs | 24 +++ src/handlers/http/modal/mod.rs | 1 + src/hottier.rs | 290 +++++++++++++++++++++++++++------ src/metrics/mod.rs | 15 ++ src/storage/azure_blob.rs | 67 ++++++++ src/storage/gcs.rs | 67 ++++++++ src/storage/metrics_layer.rs | 41 ++++- src/storage/object_storage.rs | 49 ++++++ src/storage/s3.rs | 75 +++++++++ src/sync.rs | 60 ++++--- src/telemetry.rs | 4 +- 11 files changed, 616 insertions(+), 77 deletions(-) diff --git a/src/handlers/http/logstream.rs b/src/handlers/http/logstream.rs index 74af54d98..4c38ffee2 100644 --- a/src/handlers/http/logstream.rs +++ b/src/handlers/http/logstream.rs @@ -413,6 +413,11 @@ pub async fn get_stream_info( Ok((web::Json(stream_info), StatusCode::OK)) } +#[tracing::instrument( + name = "http.put_stream_hot_tier", + skip(req, logstream, hottier), + fields(stream = tracing::field::Empty, tenant = tracing::field::Empty, size = hottier.size) +)] pub async fn put_stream_hot_tier( req: HttpRequest, logstream: Path, @@ -420,6 +425,9 @@ pub async fn put_stream_hot_tier( ) -> Result { let stream_name = logstream.into_inner(); let tenant_id = get_tenant_id_from_request(&req); + tracing::Span::current() + .record("stream", tracing::field::display(&stream_name)) + .record("tenant", tracing::field::debug(&tenant_id)); // For query mode, if the stream not found in memory map, //check if it exists in the storage //create stream and schema from storage @@ -478,12 +486,20 @@ pub async fn put_stream_hot_tier( )) } +#[tracing::instrument( + name = "http.get_stream_hot_tier", + skip(req, logstream), + fields(stream = tracing::field::Empty, tenant = tracing::field::Empty) +)] pub async fn get_stream_hot_tier( req: HttpRequest, 
logstream: Path, ) -> Result { let stream_name = logstream.into_inner(); let tenant_id = get_tenant_id_from_request(&req); + tracing::Span::current() + .record("stream", tracing::field::display(&stream_name)) + .record("tenant", tracing::field::debug(&tenant_id)); // For query mode, if the stream not found in memory map, //check if it exists in the storage //create stream and schema from storage @@ -504,12 +520,20 @@ pub async fn get_stream_hot_tier( Ok((web::Json(meta), StatusCode::OK)) } +#[tracing::instrument( + name = "http.delete_stream_hot_tier", + skip(req, logstream), + fields(stream = tracing::field::Empty, tenant = tracing::field::Empty) +)] pub async fn delete_stream_hot_tier( req: HttpRequest, logstream: Path, ) -> Result { let stream_name = logstream.into_inner(); let tenant_id = get_tenant_id_from_request(&req); + tracing::Span::current() + .record("stream", tracing::field::display(&stream_name)) + .record("tenant", tracing::field::debug(&tenant_id)); // For query mode, if the stream not found in memory map, //check if it exists in the storage //create stream and schema from storage diff --git a/src/handlers/http/modal/mod.rs b/src/handlers/http/modal/mod.rs index 77622aca5..d7a936c5f 100644 --- a/src/handlers/http/modal/mod.rs +++ b/src/handlers/http/modal/mod.rs @@ -626,6 +626,7 @@ pub type PrismMetadata = NodeMetadata; /// Initialize hot tier metadata files for streams that have hot tier configuration /// in their stream metadata but don't have local hot tier metadata files yet. /// This function is called once during query server startup. 
+#[tracing::instrument(name = "hottier.init_metadata_startup", skip(hot_tier_manager))] pub async fn initialize_hot_tier_metadata_on_startup( hot_tier_manager: &HotTierManager, ) -> anyhow::Result<()> { diff --git a/src/hottier.rs b/src/hottier.rs index 20bacbd34..fae9f88e6 100644 --- a/src/hottier.rs +++ b/src/hottier.rs @@ -45,7 +45,7 @@ use std::time::Duration; use sysinfo::Disks; use tokio::fs::{self, DirEntry}; use tokio_stream::wrappers::ReadDirStream; -use tracing::{error, info}; +use tracing::{Instrument, error, info}; pub const STREAM_HOT_TIER_FILENAME: &str = ".hot_tier.json"; pub const MIN_STREAM_HOT_TIER_SIZE_BYTES: u64 = 10737418240; // 10 GiB @@ -144,6 +144,12 @@ impl HotTierManager { /// deletes parquet files that exist but are not in their date manifest, /// then recomputes `used_size` / `available_size` from the cleaned /// manifests and persists the updated `StreamHotTier`. + #[tracing::instrument( + name = "hottier.reconcile_stream", + skip(self), + fields(stream = %stream, tenant = ?tenant_id), + err + )] async fn reconcile_stream( &self, stream: &str, @@ -217,6 +223,11 @@ impl HotTierManager { Ok(sht) } + #[tracing::instrument( + name = "hottier.drop_partials", + skip(self, on_disk, partials_removed), + fields(stream = %stream, tenant = ?tenant_id, date_dir = %date_dir.display()) + )] async fn drop_partials( &self, on_disk: &mut HashSet, @@ -227,10 +238,33 @@ impl HotTierManager { ) -> Result<(), HotTierError> { let mut stack: Vec = vec![date_dir.clone()]; while let Some(dir) = stack.pop() { - let mut entries = fs::read_dir(&dir).await?; - while let Some(entry) = entries.next_entry().await? { + let mut entries = fs::read_dir(&dir).await.map_err(|e| { + error!( + stream = %stream, + tenant = ?tenant_id, + dir = ?dir, + error = ?e + ); + e + })?; + while let Some(entry) = entries.next_entry().await.map_err(|e| { + error!( + stream = %stream, + tenant = ?tenant_id, + error = ?e + ); + e + })? 
{ let p = entry.path(); - let ft = entry.file_type().await?; + let ft = entry.file_type().await.map_err(|e| { + error!( + stream = %stream, + tenant = ?tenant_id, + entry = ?entry, + error = ?e + ); + e + })?; if ft.is_dir() { stack.push(p); continue; @@ -264,6 +298,11 @@ impl HotTierManager { Ok(()) } + #[tracing::instrument( + name = "hottier.clean_manifest", + skip(self, keep_names, total_used, entries_dropped), + fields(stream = %stream, tenant = ?tenant_id, date_dir = %date_dir.display()) + )] async fn clean_manifest( &self, keep_names: &mut HashSet, @@ -275,7 +314,15 @@ impl HotTierManager { ) -> Result<(), HotTierError> { let manifest_path = date_dir.join("hottier.manifest.json"); let mut manifest: Manifest = if manifest_path.exists() { - let bytes = fs::read(&manifest_path).await?; + let bytes = fs::read(&manifest_path).await.map_err(|e| { + error!( + stream = %stream, + tenant = ?tenant_id, + manifest_path = ?manifest_path, + error = ?e + ); + e + })?; serde_json::from_slice(&bytes).unwrap_or_default() } else { Manifest::default() @@ -310,7 +357,28 @@ impl HotTierManager { if manifest_path.exists() || !manifest.files.is_empty() { fs::create_dir_all(&date_dir).await?; - fs::write(&manifest_path, serde_json::to_vec(&manifest)?).await?; + fs::write( + &manifest_path, + serde_json::to_vec(&manifest).map_err(|e| { + error!( + stream = %stream, + tenant = ?tenant_id, + manifest_path = ?manifest_path, + error = ?e + ); + e + })?, + ) + .await + .map_err(|e| { + error!( + stream = %stream, + tenant = ?tenant_id, + manifest_path = ?manifest_path, + error = ?e + ); + e + })?; } Ok(()) } @@ -327,6 +395,12 @@ impl HotTierManager { } /// get the total hot tier size for all streams + #[tracing::instrument( + name = "hottier.get_hot_tiers_size", + skip(self), + fields(current_stream = %current_stream, current_tenant = ?current_tenant_id), + err + )] pub async fn get_hot_tiers_size( &self, current_stream: &str, @@ -358,6 +432,12 @@ impl HotTierManager { /// check disk 
usage and hot tier size of all other streams /// check if total hot tier size of all streams is less than max disk usage /// delete all the files from hot tier once validation is successful and hot tier is ready to be updated + #[tracing::instrument( + name = "hottier.validate_size", + skip(self), + fields(stream = %stream, tenant = ?tenant_id, size = stream_hot_tier_size), + err + )] pub async fn validate_hot_tier_size( &self, stream: &str, @@ -420,6 +500,12 @@ impl HotTierManager { } /// get the hot tier metadata file for the stream + #[tracing::instrument( + name = "hottier.get_hot_tier", + skip(self), + fields(stream = %stream, tenant = ?tenant_id), + err + )] pub async fn get_hot_tier( &self, stream: &str, @@ -442,6 +528,12 @@ impl HotTierManager { Ok(stream_hot_tier) } + #[tracing::instrument( + name = "hottier.delete_hot_tier", + skip(self), + fields(stream = %stream, tenant = ?tenant_id), + err + )] pub async fn delete_hot_tier( &self, stream: &str, @@ -458,7 +550,14 @@ impl HotTierManager { } else { self.hot_tier_path.join(stream) }; - fs::remove_dir_all(path).await?; + fs::remove_dir_all(path).await.map_err(|e| { + error!( + stream = %stream, + tenant = ?tenant_id, + error = ?e + ); + e + })?; self.invalidate_state(stream, tenant_id).await; Ok(()) @@ -466,6 +565,12 @@ impl HotTierManager { /// put the hot tier metadata file for the stream /// set the updated_date_range in the hot tier metadata file + #[tracing::instrument( + name = "hottier.put_hot_tier", + skip(self, hot_tier), + fields(stream = %stream, tenant = ?tenant_id, size = hot_tier.size), + err + )] pub async fn put_hot_tier( &self, stream: &str, @@ -502,6 +607,7 @@ impl HotTierManager { /// Discover hot-tier-enabled streams at boot and spawn a per-stream pair /// of (Latest, Historic) loops for each. New streams added later acquire /// their own loops via `spawn_stream_tasks` from the PUT hot-tier handler. 
+ #[tracing::instrument(name = "hottier.startup", skip(self), err)] pub fn download_from_s3<'a>(&'a self) -> Result<(), HotTierError> where 'a: 'static, @@ -515,45 +621,54 @@ impl HotTierManager { ); let this: &'static HotTierManager = self; - tokio::spawn(async move { - // pstats hot tier may need to be created on boot before any tasks - // can pick it up. - if let Err(e) = this.create_pstats_hot_tier().await { - tracing::error!("Skipping pstats hot tier creation because of error: {e}"); - } - let tenants = if let Some(tenants) = PARSEABLE.list_tenants() { - tenants.into_iter().map(Some).collect::>() - } else { - vec![None] - }; - for tenant_id in tenants { - for stream in PARSEABLE.streams.list(&tenant_id) { - if this.check_stream_hot_tier_exists(&stream, &tenant_id) { - this.spawn_stream_tasks(stream, tenant_id.clone()).await; - } else { - // check for potential orphan directory on disk - let path = if let Some(tenant_id) = tenant_id.as_ref() { - self.hot_tier_path.join(tenant_id).join(stream) + let startup_span = tracing::info_span!("hottier.startup.bootstrap"); + tokio::spawn( + async move { + // pstats hot tier may need to be created on boot before any tasks + // can pick it up. 
+ if let Err(e) = this.create_pstats_hot_tier().await { + tracing::error!("Skipping pstats hot tier creation because of error: {e}"); + } + let tenants = if let Some(tenants) = PARSEABLE.list_tenants() { + tenants.into_iter().map(Some).collect::>() + } else { + vec![None] + }; + for tenant_id in tenants { + for stream in PARSEABLE.streams.list(&tenant_id) { + if this.check_stream_hot_tier_exists(&stream, &tenant_id) { + this.spawn_stream_tasks(stream, tenant_id.clone()).await; } else { - self.hot_tier_path.join(stream) - }; - if path.exists() { - // delete this entire folder as stream meta says no hottier for stream - if let Err(e) = fs::remove_dir_all(&path).await { - tracing::error!( - "Unable to remove orphaned hottier dir- `{path:?}` with error- {e}" - ); + // check for potential orphan directory on disk + let path = if let Some(tenant_id) = tenant_id.as_ref() { + self.hot_tier_path.join(tenant_id).join(stream) + } else { + self.hot_tier_path.join(stream) }; + if path.exists() { + // delete this entire folder as stream meta says no hottier for stream + if let Err(e) = fs::remove_dir_all(&path).await { + tracing::error!( + "Unable to remove orphaned hottier dir- `{path:?}` with error- {e}" + ); + }; + } } } } } - }); + .instrument(startup_span), + ); Ok(()) } /// Spawn (Latest, Historic) loops for a single stream. Idempotent: /// if tasks already exist for this (tenant, stream), no-op. 
+ #[tracing::instrument( + name = "hottier.spawn_stream_tasks", + skip(self), + fields(stream = %stream, tenant = ?tenant_id) + )] pub async fn spawn_stream_tasks(&'static self, stream: String, tenant_id: Option) { let key: StreamKey = (tenant_id.clone(), stream.clone()); { @@ -576,13 +691,23 @@ impl HotTierManager { let t = tenant_id.clone(); let latest = tokio::spawn(async move { loop { - info!(stream = %s, tenant = ?t, phase = ?SyncPhase::Latest, "stream tick fired"); - if let Err(err) = self - .process_stream(s.clone(), t.clone(), SyncPhase::Latest) - .await - { - error!(stream = %s, "latest sync error: {err:?}"); + let tick_span = tracing::info_span!( + "hottier.tick", + stream = %s, + tenant = ?t, + phase = "latest" + ); + async { + info!("stream tick fired"); + if let Err(err) = self + .process_stream(s.clone(), t.clone(), SyncPhase::Latest) + .await + { + error!("latest sync error: {err:?}"); + } } + .instrument(tick_span) + .await; tokio::time::sleep(latest_interval).await; } }); @@ -591,13 +716,23 @@ impl HotTierManager { let t = tenant_id.clone(); let historic = tokio::spawn(async move { loop { - info!(stream = %s, tenant = ?t, phase = ?SyncPhase::Historic, "stream tick fired"); - if let Err(err) = self - .process_stream(s.clone(), t.clone(), SyncPhase::Historic) - .await - { - error!(stream = %s, "historic sync error: {err:?}"); + let tick_span = tracing::info_span!( + "hottier.tick", + stream = %s, + tenant = ?t, + phase = "historic" + ); + async { + info!("stream tick fired"); + if let Err(err) = self + .process_stream(s.clone(), t.clone(), SyncPhase::Historic) + .await + { + error!("historic sync error: {err:?}"); + } } + .instrument(tick_span) + .await; tokio::time::sleep(historic_interval).await; } }); @@ -628,6 +763,12 @@ impl HotTierManager { /// process the hot tier files for the stream /// delete the files from the hot tier directory if the available date range is outside the hot tier range + #[tracing::instrument( + name = 
"hottier.process_stream", + skip(self), + fields(stream = %stream, tenant = ?tenant_id, phase = ?phase), + err + )] async fn process_stream( &self, stream: String, @@ -645,6 +786,12 @@ impl HotTierManager { .get_all_manifest_files(&stream, &tenant_id) .await .map_err(|e| { + error!( + stream = %stream, + tenant = ?tenant_id, + phase = ?phase, + error = ?e + ); HotTierError::ObjectStorageError(ObjectStorageError::MetastoreError(Box::new( e.to_detail(), ))) @@ -665,7 +812,16 @@ impl HotTierManager { let stream_start = std::time::Instant::now(); self.process_manifest(&stream, &mut s3_manifest_file_list, &tenant_id, phase) - .await?; + .await + .map_err(|e| { + error!( + stream = %stream, + tenant = ?tenant_id, + phase = ?phase, + error = ?e + ); + e + })?; info!( stream = %stream, @@ -681,6 +837,11 @@ impl HotTierManager { /// collect all manifests from metastore for the date, sort the parquet file list /// in order to download the latest files first /// download the parquet files if not present in hot tier directory + #[tracing::instrument( + name = "hottier.process_manifest", + skip(self, manifest_files_to_download), + fields(stream = %stream, tenant = ?tenant_id, phase = ?phase, dates = manifest_files_to_download.len()) + )] async fn process_manifest( &self, stream: &str, @@ -809,6 +970,19 @@ impl HotTierManager { /// Returns false when no budget is available (caller should stop scheduling /// further work for this stream). 
#[allow(clippy::too_many_arguments)] + #[tracing::instrument( + name = "hottier.process_parquet_file", + skip(self, parquet_file, parquet_path, state), + fields( + stream = %stream, + tenant = ?tenant_id, + phase = ?phase, + date = %date, + file = %parquet_file.file_path, + file_size = parquet_file.file_size + ), + err + )] async fn process_parquet_file_concurrent( &self, stream: &str, @@ -999,6 +1173,12 @@ impl HotTierManager { } ///fetch the list of dates available in the hot tier directory for the stream and sort them + #[tracing::instrument( + name = "hottier.fetch_dates", + skip(self), + fields(stream = %stream, tenant = ?tenant_id), + err + )] pub async fn fetch_hot_tier_dates( &self, stream: &str, @@ -1112,6 +1292,12 @@ impl HotTierManager { } ///get the list of parquet files from the hot tier directory for the stream + #[tracing::instrument( + name = "hottier.get_parquet_files", + skip(self), + fields(stream = %stream, tenant = ?tenant_id), + err + )] pub async fn get_hot_tier_parquet_files( &self, stream: &str, @@ -1164,6 +1350,12 @@ impl HotTierManager { /// check for the oldest entry to delete if the path exists in hot tier /// update the used and available size in the hot tier metadata /// loop if available size is still less than the parquet file size + #[tracing::instrument( + name = "hottier.cleanup_old_data", + skip(self, stream_hot_tier, download_file_path), + fields(stream = %stream, tenant = ?tenant_id, target_size = parquet_file_size), + err + )] pub async fn cleanup_hot_tier_old_data( &self, stream: &str, @@ -1408,6 +1600,7 @@ impl HotTierManager { Ok(None) } + #[tracing::instrument(name = "hottier.put_internal_stream", skip(self), err)] pub async fn put_internal_stream_hot_tier(&self) -> Result<(), HotTierError> { let tenants = if let Some(tenants) = PARSEABLE.list_tenants() { tenants.into_iter().map(Some).collect() @@ -1439,6 +1632,7 @@ impl HotTierManager { } /// Creates hot tier for pstats internal stream if the stream exists in storage + 
#[tracing::instrument(name = "hottier.create_pstats", skip(self), err)] async fn create_pstats_hot_tier(&self) -> Result<(), HotTierError> { let tenants = if let Some(tenants) = PARSEABLE.list_tenants() { tenants.into_iter().map(Some).collect() diff --git a/src/metrics/mod.rs b/src/metrics/mod.rs index c7689863a..f00c8627a 100644 --- a/src/metrics/mod.rs +++ b/src/metrics/mod.rs @@ -389,6 +389,18 @@ pub static STORAGE_REQUEST_RESPONSE_TIME: Lazy = Lazy::new(|| { .expect("metric can be created") }); +pub static STORAGE_REQUESTS_INFLIGHT: Lazy = Lazy::new(|| { + IntGaugeVec::new( + Opts::new( + "storage_requests_inflight", + "Number of in-flight object store requests", + ) + .namespace(METRICS_NAMESPACE), + &["provider", "method"], + ) + .expect("metric can be created") +}); + pub static TOTAL_METRICS_COLLECTED_BY_DATE: Lazy = Lazy::new(|| { IntCounterVec::new( Opts::new( @@ -565,6 +577,9 @@ fn custom_metrics(registry: &Registry) { registry .register(Box::new(STORAGE_REQUEST_RESPONSE_TIME.clone())) .expect("metric can be registered"); + registry + .register(Box::new(STORAGE_REQUESTS_INFLIGHT.clone())) + .expect("metric can be registered"); registry .register(Box::new(TOTAL_METRICS_COLLECTED_BY_DATE.clone())) .expect("metric can be registered"); diff --git a/src/storage/azure_blob.rs b/src/storage/azure_blob.rs index 7bf6c033a..aab307c54 100644 --- a/src/storage/azure_blob.rs +++ b/src/storage/azure_blob.rs @@ -237,6 +237,12 @@ impl BlobStore { } } + #[tracing::instrument( + name = "azure.parallel_download", + skip(self, partial_path), + fields(path = %path, tenant = ?tenant_id, total_bytes = tracing::field::Empty, chunks = tracing::field::Empty), + err + )] async fn _parallel_download_inner( &self, path: &RelativePath, @@ -265,6 +271,9 @@ impl BlobStore { .map(|s| s..(s + chunk).min(total)) .collect(); let chunk_count = ranges.len() as u64; + tracing::Span::current() + .record("total_bytes", total) + .record("chunks", chunk_count); let client = self.client.clone(); 
let semaphore = Arc::new(tokio::sync::Semaphore::new(concurrency as usize)); @@ -309,6 +318,12 @@ impl BlobStore { Ok(()) } + #[tracing::instrument( + name = "azure.get_object", + skip(self), + fields(path = %path, tenant = ?tenant_id, bytes = tracing::field::Empty), + err + )] async fn _get_object( &self, path: &RelativePath, @@ -321,6 +336,7 @@ impl BlobStore { match resp { Ok(resp) => { let body: Bytes = resp.bytes().await?; + tracing::Span::current().record("bytes", body.len()); increment_files_scanned_in_object_store_calls_by_date( "GET", 1, @@ -339,6 +355,11 @@ impl BlobStore { } } + #[tracing::instrument( + name = "azure.put_object", + skip(self, resource), + fields(path = %path, tenant = ?tenant_id, bytes = resource.content_length()) + )] async fn _put_object( &self, path: &RelativePath, @@ -362,6 +383,12 @@ impl BlobStore { } } + #[tracing::instrument( + name = "azure.delete_prefix", + skip(self), + fields(prefix = %key, tenant = ?tenant_id, deleted = tracing::field::Empty, failed = tracing::field::Empty), + err + )] async fn _delete_prefix( &self, key: &str, @@ -392,6 +419,9 @@ impl BlobStore { } let total_files = files_deleted + failed_deletes; + tracing::Span::current() + .record("deleted", files_deleted) + .record("failed", failed_deletes); increment_files_scanned_in_object_store_calls_by_date( "LIST", total_files, @@ -414,6 +444,12 @@ impl BlobStore { Ok(()) } + #[tracing::instrument( + name = "azure.list_dates", + skip(self), + fields(stream = %stream, tenant = ?tenant_id, dates = tracing::field::Empty), + err + )] async fn _list_dates( &self, stream: &str, @@ -448,10 +484,16 @@ impl BlobStore { .filter_map(|path| path.as_ref().strip_prefix(&format!("{stream}/"))) .map(String::from) .collect(); + tracing::Span::current().record("dates", dates.len()); Ok(dates) } + #[tracing::instrument( + name = "azure.upload_file", + skip(self), + fields(key = %key, path = %path.display(), tenant = ?tenant_id) + )] async fn _upload_file( &self, key: &str, @@ -476,6 
+518,11 @@ impl BlobStore { } } + #[tracing::instrument( + name = "azure.upload_multipart", + skip(self), + fields(key = %key, path = %path.display(), tenant = ?tenant_id, total_bytes = tracing::field::Empty) + )] async fn _upload_multipart( &self, key: &RelativePath, @@ -488,6 +535,7 @@ impl BlobStore { let meta = file.metadata().await?; let total_size = meta.len() as usize; + tracing::Span::current().record("total_bytes", total_size); let min_multipart_size = PARSEABLE.options.min_multipart_size as usize; if total_size < min_multipart_size || !PARSEABLE.options.enable_multipart { let mut data = Vec::new(); @@ -601,6 +649,12 @@ impl ObjectStorage for BlobStore { self._upload_multipart(key, path, tenant_id).await } + #[tracing::instrument( + name = "azure.head", + skip(self), + fields(path = %path, tenant = ?tenant_id), + err + )] async fn head( &self, path: &RelativePath, @@ -741,6 +795,12 @@ impl ObjectStorage for BlobStore { Ok(()) } + #[tracing::instrument( + name = "azure.delete_object", + skip(self), + fields(path = %path, tenant = ?tenant_id), + err + )] async fn delete_object( &self, path: &RelativePath, @@ -765,11 +825,18 @@ impl ObjectStorage for BlobStore { Ok(result?) 
} + #[tracing::instrument( + name = "azure.check", + skip(self), + fields(tenant = ?tenant_id, ok = tracing::field::Empty), + err + )] async fn check(&self, tenant_id: &Option) -> Result<(), ObjectStorageError> { let result = self .client .head(&to_object_store_path(&parseable_json_path())) .await; + tracing::Span::current().record("ok", result.is_ok()); let tenant = tenant_id.as_deref().unwrap_or(DEFAULT_TENANT); increment_object_store_calls_by_date("HEAD", &Utc::now().date_naive().to_string(), tenant); if result.is_ok() { diff --git a/src/storage/gcs.rs b/src/storage/gcs.rs index de1f8e578..fa3e8c550 100644 --- a/src/storage/gcs.rs +++ b/src/storage/gcs.rs @@ -203,6 +203,12 @@ impl Gcs { } } + #[tracing::instrument( + name = "gcs.parallel_download", + skip(self, partial_path), + fields(path = %path, tenant = ?tenant_id, total_bytes = tracing::field::Empty, chunks = tracing::field::Empty), + err + )] async fn _parallel_download_inner( &self, path: &RelativePath, @@ -231,6 +237,9 @@ impl Gcs { .map(|s| s..(s + chunk).min(total)) .collect(); let chunk_count = ranges.len() as u64; + tracing::Span::current() + .record("total_bytes", total) + .record("chunks", chunk_count); let client = self.client.clone(); let semaphore = Arc::new(tokio::sync::Semaphore::new(concurrency as usize)); @@ -275,6 +284,12 @@ impl Gcs { Ok(()) } + #[tracing::instrument( + name = "gcs.get_object", + skip(self), + fields(path = %path, tenant = ?tenant_id, bytes = tracing::field::Empty), + err + )] async fn _get_object( &self, path: &RelativePath, @@ -286,6 +301,7 @@ impl Gcs { match resp { Ok(resp) => { let body: Bytes = resp.bytes().await?; + tracing::Span::current().record("bytes", body.len()); increment_files_scanned_in_object_store_calls_by_date( "GET", 1, @@ -304,6 +320,11 @@ impl Gcs { } } + #[tracing::instrument( + name = "gcs.put_object", + skip(self, resource), + fields(path = %path, tenant = ?tenant_id, bytes = resource.content_length()) + )] async fn _put_object( &self, path: 
&RelativePath, @@ -327,6 +348,12 @@ impl Gcs { } } + #[tracing::instrument( + name = "gcs.delete_prefix", + skip(self), + fields(prefix = %key, tenant = ?tenant_id, deleted = tracing::field::Empty, failed = tracing::field::Empty), + err + )] async fn _delete_prefix( &self, key: &str, @@ -357,6 +384,9 @@ impl Gcs { } let total_files = files_deleted + failed_deletes; + tracing::Span::current() + .record("deleted", files_deleted) + .record("failed", failed_deletes); increment_files_scanned_in_object_store_calls_by_date( "LIST", total_files, @@ -379,6 +409,12 @@ impl Gcs { Ok(()) } + #[tracing::instrument( + name = "gcs.list_dates", + skip(self), + fields(stream = %stream, tenant = ?tenant_id, dates = tracing::field::Empty), + err + )] async fn _list_dates( &self, stream: &str, @@ -413,10 +449,16 @@ impl Gcs { .filter_map(|path| path.as_ref().strip_prefix(&format!("{stream}/"))) .map(String::from) .collect(); + tracing::Span::current().record("dates", dates.len()); Ok(dates) } + #[tracing::instrument( + name = "gcs.upload_file", + skip(self), + fields(key = %key, path = %path.display(), tenant = ?tenant_id) + )] async fn _upload_file( &self, key: &str, @@ -441,6 +483,11 @@ impl Gcs { } } + #[tracing::instrument( + name = "gcs.upload_multipart", + skip(self), + fields(key = %key, path = %path.display(), tenant = ?tenant_id, total_bytes = tracing::field::Empty) + )] async fn _upload_multipart( &self, key: &RelativePath, @@ -452,6 +499,7 @@ impl Gcs { let tenant = tenant_id.as_deref().unwrap_or(DEFAULT_TENANT); let meta = file.metadata().await?; let total_size = meta.len() as usize; + tracing::Span::current().record("total_bytes", total_size); let min_multipart_size = PARSEABLE.options.min_multipart_size as usize; if total_size < min_multipart_size || !PARSEABLE.options.enable_multipart { let mut data = Vec::new(); @@ -583,6 +631,12 @@ impl ObjectStorage for Gcs { self._upload_multipart(key, path, tenant_id).await } + #[tracing::instrument( + name = "gcs.head", + 
skip(self), + fields(path = %path, tenant = ?tenant_id), + err + )] async fn head( &self, path: &RelativePath, @@ -723,6 +777,12 @@ impl ObjectStorage for Gcs { Ok(()) } + #[tracing::instrument( + name = "gcs.delete_object", + skip(self), + fields(path = %path, tenant = ?tenant_id), + err + )] async fn delete_object( &self, path: &RelativePath, @@ -747,11 +807,18 @@ impl ObjectStorage for Gcs { Ok(result?) } + #[tracing::instrument( + name = "gcs.check", + skip(self), + fields(tenant = ?tenant_id, ok = tracing::field::Empty), + err + )] async fn check(&self, tenant_id: &Option) -> Result<(), ObjectStorageError> { let result = self .client .head(&to_object_store_path(&parseable_json_path())) .await; + tracing::Span::current().record("ok", result.is_ok()); let tenant = tenant_id.as_deref().unwrap_or(DEFAULT_TENANT); increment_object_store_calls_by_date("HEAD", &Utc::now().date_naive().to_string(), tenant); diff --git a/src/storage/metrics_layer.rs b/src/storage/metrics_layer.rs index ac82e43d6..297e65eec 100644 --- a/src/storage/metrics_layer.rs +++ b/src/storage/metrics_layer.rs @@ -31,7 +31,34 @@ use object_store::{ Result as ObjectStoreResult, path::Path, }; -use crate::metrics::STORAGE_REQUEST_RESPONSE_TIME; +use crate::metrics::{STORAGE_REQUEST_RESPONSE_TIME, STORAGE_REQUESTS_INFLIGHT}; + +/// RAII guard that increments the in-flight gauge on construction and +/// decrements on drop. Handles early returns, panics, and dropped futures. 
+struct InflightGuard { + provider: String, + method: &'static str, +} + +impl InflightGuard { + fn new(provider: &str, method: &'static str) -> Self { + STORAGE_REQUESTS_INFLIGHT + .with_label_values(&[provider, method]) + .inc(); + Self { + provider: provider.to_string(), + method, + } + } +} + +impl Drop for InflightGuard { + fn drop(&mut self) { + STORAGE_REQUESTS_INFLIGHT + .with_label_values(&[&self.provider, self.method]) + .dec(); + } +} // Public helper function to map object_store errors to HTTP status codes pub fn error_to_status_code(err: &object_store::Error) -> &'static str { @@ -91,6 +118,7 @@ impl ObjectStore for MetricLayer { payload: PutPayload, opts: PutOptions, ) -> ObjectStoreResult { + let _guard = InflightGuard::new(&self.provider, "PUT"); let time = time::Instant::now(); let put_result = self.inner.put_opts(location, payload, opts).await; let elapsed = time.elapsed().as_secs_f64(); @@ -111,6 +139,7 @@ impl ObjectStore for MetricLayer { location: &Path, opts: PutMultipartOptions, ) -> ObjectStoreResult> { + let _guard = InflightGuard::new(&self.provider, "PUT_MULTIPART"); let time = time::Instant::now(); let result = self.inner.put_multipart_opts(location, opts).await; let elapsed = time.elapsed().as_secs_f64(); @@ -127,6 +156,7 @@ impl ObjectStore for MetricLayer { } async fn get_opts(&self, location: &Path, options: GetOptions) -> ObjectStoreResult { + let _guard = InflightGuard::new(&self.provider, "GET"); let time = time::Instant::now(); let result = self.inner.get_opts(location, options).await; let elapsed = time.elapsed().as_secs_f64(); @@ -147,6 +177,7 @@ impl ObjectStore for MetricLayer { location: &Path, ranges: &[Range], ) -> ObjectStoreResult> { + let _guard = InflightGuard::new(&self.provider, "GET_RANGES"); let time = time::Instant::now(); let result = self.inner.get_ranges(location, ranges).await; let elapsed = time.elapsed().as_secs_f64(); @@ -170,6 +201,7 @@ impl ObjectStore for MetricLayer { } fn list(&self, prefix: 
Option<&Path>) -> BoxStream<'static, ObjectStoreResult> { + let _guard = InflightGuard::new(&self.provider, "LIST"); let time = time::Instant::now(); let inner = self.inner.list(prefix); let res = StreamMetricWrapper { @@ -177,6 +209,7 @@ impl ObjectStore for MetricLayer { provider: self.provider.clone(), method: "LIST", status: "200", + _guard, inner, }; Box::pin(res) @@ -187,6 +220,7 @@ impl ObjectStore for MetricLayer { prefix: Option<&Path>, offset: &Path, ) -> BoxStream<'static, ObjectStoreResult> { + let _guard = InflightGuard::new(&self.provider, "LIST_OFFSET"); let time = time::Instant::now(); let inner = self.inner.list_with_offset(prefix, offset); let res = StreamMetricWrapper { @@ -194,6 +228,7 @@ impl ObjectStore for MetricLayer { provider: self.provider.clone(), method: "LIST_OFFSET", status: "200", + _guard, inner, }; @@ -201,6 +236,7 @@ impl ObjectStore for MetricLayer { } async fn list_with_delimiter(&self, prefix: Option<&Path>) -> ObjectStoreResult { + let _guard = InflightGuard::new(&self.provider, "LIST_DELIM"); let time = time::Instant::now(); let result = self.inner.list_with_delimiter(prefix).await; let elapsed = time.elapsed().as_secs_f64(); @@ -222,6 +258,7 @@ impl ObjectStore for MetricLayer { to: &Path, options: CopyOptions, ) -> ObjectStoreResult<()> { + let _guard = InflightGuard::new(&self.provider, "COPY"); let time = time::Instant::now(); let result = self.inner.copy_opts(from, to, options).await; let elapsed = time.elapsed().as_secs_f64(); @@ -243,6 +280,7 @@ impl ObjectStore for MetricLayer { to: &Path, options: RenameOptions, ) -> ObjectStoreResult<()> { + let _guard = InflightGuard::new(&self.provider, "RENAME"); let time = time::Instant::now(); let result = self.inner.rename_opts(from, to, options).await; let elapsed = time.elapsed().as_secs_f64(); @@ -264,6 +302,7 @@ struct StreamMetricWrapper<'a, T> { provider: String, method: &'static str, status: &'static str, + _guard: InflightGuard, inner: BoxStream<'a, T>, } diff --git 
a/src/storage/object_storage.rs b/src/storage/object_storage.rs index 3e5ccef03..b67cf18f9 100644 --- a/src/storage/object_storage.rs +++ b/src/storage/object_storage.rs @@ -98,6 +98,11 @@ pub(crate) struct UploadResult { } /// Handles the upload of a single parquet file +#[tracing::instrument( + name = "object_store.upload_single_parquet", + skip(store, schema), + fields(stream = %stream_name, tenant = ?tenant_id, path = %path.display(), key = %stream_relative_path) +)] async fn upload_single_parquet_file( store: Arc, path: std::path::PathBuf, @@ -240,6 +245,11 @@ async fn calculate_stats_if_enabled( } /// Validates that a parquet file uploaded to object storage matches the staging file size +#[tracing::instrument( + name = "object_store.validate_uploaded_parquet", + skip(store), + fields(stream = %stream_name, tenant = ?tenant_id, key = %stream_relative_path, expected_size) +)] async fn validate_uploaded_parquet_file( store: &Arc, stream_relative_path: &str, @@ -1004,6 +1014,12 @@ pub trait ObjectStorage: Debug + Send + Sync + 'static { // pick a better name fn get_bucket_name(&self) -> String; + #[tracing::instrument( + name = "object_store.upload_files_from_staging", + skip(self), + fields(stream = %stream_name, tenant = ?tenant_id), + err + )] async fn upload_files_from_staging( &self, stream_name: &str, @@ -1033,6 +1049,18 @@ pub trait ObjectStorage: Debug + Send + Sync + 'static { } /// Processes parquet files concurrently and returns stats status and manifest files +#[tracing::instrument( + name = "object_store.process_parquet_files", + skip(upload_context), + fields( + stream = %stream_name, + tenant = ?tenant_id, + parquet_count = tracing::field::Empty, + total_size_bytes = tracing::field::Empty, + smallest_date = tracing::field::Empty, + largest_date = tracing::field::Empty, + ) +)] async fn process_parquet_files( upload_context: &UploadContext, stream_name: &str, @@ -1059,6 +1087,27 @@ async fn process_parquet_files( ret }; + let mut total_size: u64 = 
0; + let mut min_dt: Option> = None; + let mut max_dt: Option> = None; + for path in &parquet_paths { + if let Ok(meta) = path.metadata() { + total_size = total_size.saturating_add(meta.len()); + } + if let Ok(dt) = extract_datetime_from_parquet_path_regex(path) { + min_dt = Some(min_dt.map_or(dt, |cur| cur.min(dt))); + max_dt = Some(max_dt.map_or(dt, |cur| cur.max(dt))); + } + } + let span = tracing::Span::current(); + span.record("parquet_count", parquet_paths.len()); + span.record("total_size_bytes", total_size); + if let Some(dt) = min_dt { + span.record("smallest_date", tracing::field::display(dt)); + } + if let Some(dt) = max_dt { + span.record("largest_date", tracing::field::display(dt)); + } // Spawn upload tasks for each parquet file for path in parquet_paths { spawn_parquet_upload_task( diff --git a/src/storage/s3.rs b/src/storage/s3.rs index 98ca91036..bb8135287 100644 --- a/src/storage/s3.rs +++ b/src/storage/s3.rs @@ -364,6 +364,12 @@ impl S3 { } } + #[tracing::instrument( + name = "s3.parallel_download", + skip(self, partial_path), + fields(path = %path, tenant = ?tenant_id, total_bytes = tracing::field::Empty, chunks = tracing::field::Empty), + err + )] async fn _parallel_download_inner( &self, path: &RelativePath, @@ -392,6 +398,9 @@ impl S3 { .map(|s| s..(s + chunk).min(total)) .collect(); let chunk_count = ranges.len() as u64; + tracing::Span::current() + .record("total_bytes", total) + .record("chunks", chunk_count); let client = Arc::new(self.client.clone()); let semaphore = Arc::new(tokio::sync::Semaphore::new(concurrency as usize)); @@ -436,6 +445,12 @@ impl S3 { Ok(()) } + #[tracing::instrument( + name = "s3.get_object", + skip(self), + fields(path = %path, tenant = ?tenant_id, bytes = tracing::field::Empty), + err + )] async fn _get_object( &self, path: &RelativePath, @@ -452,6 +467,7 @@ impl S3 { match resp { Ok(resp) => { let body = resp.bytes().await?; + tracing::Span::current().record("bytes", body.len()); 
increment_files_scanned_in_object_store_calls_by_date( "GET", 1, @@ -470,6 +486,11 @@ impl S3 { } } + #[tracing::instrument( + name = "s3.put_object", + skip(self, resource), + fields(path = %path, tenant = ?tenant_id, bytes = resource.content_length()) + )] async fn _put_object( &self, path: &RelativePath, @@ -497,6 +518,12 @@ impl S3 { } } + #[tracing::instrument( + name = "s3.delete_prefix", + skip(self), + fields(prefix = %key, tenant = ?tenant_id, deleted = tracing::field::Empty, failed = tracing::field::Empty), + err + )] async fn _delete_prefix( &self, key: &str, @@ -531,6 +558,9 @@ impl S3 { } let total_files = files_deleted + failed_deletes; + tracing::Span::current() + .record("deleted", files_deleted) + .record("failed", failed_deletes); increment_files_scanned_in_object_store_calls_by_date( "LIST", total_files, @@ -553,6 +583,12 @@ impl S3 { Ok(()) } + #[tracing::instrument( + name = "s3.list_dates", + skip(self), + fields(stream = %stream, tenant = ?tenant_id, dates = tracing::field::Empty), + err + )] async fn _list_dates( &self, stream: &str, @@ -595,10 +631,16 @@ impl S3 { .filter_map(|path| path.as_ref().strip_prefix(&prefix)) .map(String::from) .collect(); + tracing::Span::current().record("dates", dates.len()); Ok(dates) } + #[tracing::instrument( + name = "s3.upload_file", + skip(self), + fields(key = %key, path = %path.display(), tenant = ?tenant_id) + )] async fn _upload_file( &self, key: &str, @@ -628,6 +670,11 @@ impl S3 { } } + #[tracing::instrument( + name = "s3.upload_multipart", + skip(self), + fields(key = %key, path = %path.display(), tenant = ?tenant_id, total_bytes = tracing::field::Empty, parts = tracing::field::Empty) + )] async fn _upload_multipart( &self, key: &RelativePath, @@ -684,6 +731,9 @@ impl S3 { let has_final_partial_part = !total_size.is_multiple_of(min_multipart_size); let num_full_parts = total_size / min_multipart_size; let total_parts = num_full_parts + if has_final_partial_part { 1 } else { 0 }; + 
tracing::Span::current() + .record("total_bytes", total_size) + .record("parts", total_parts); // Upload each part with metrics for part_number in 0..(total_parts) { @@ -767,6 +817,12 @@ impl ObjectStorage for S3 { self._upload_multipart(key, path, tenant_id).await } + #[tracing::instrument( + name = "s3.head", + skip(self), + fields(path = %path, tenant = ?tenant_id), + err + )] async fn head( &self, path: &RelativePath, @@ -929,6 +985,12 @@ impl ObjectStorage for S3 { Ok(()) } + #[tracing::instrument( + name = "s3.delete_object", + skip(self), + fields(path = %path, tenant = ?tenant_id), + err + )] async fn delete_object( &self, path: &RelativePath, @@ -953,12 +1015,25 @@ impl ObjectStorage for S3 { Ok(result?) } + #[tracing::instrument( + name = "s3.check", + skip(self), + fields(tenant = ?tenant_id), + err + )] + #[tracing::instrument( + name = "s3.check", + skip(self), + fields(tenant = ?tenant_id, ok = tracing::field::Empty), + err + )] async fn check(&self, tenant_id: &Option) -> Result<(), ObjectStorageError> { let tenant_str = tenant_id.as_deref().unwrap_or(DEFAULT_TENANT); let result = self .client .head(&to_object_store_path(&parseable_json_path())) .await; + tracing::Span::current().record("ok", result.is_ok()); increment_object_store_calls_by_date( "HEAD", &Utc::now().date_naive().to_string(), diff --git a/src/sync.rs b/src/sync.rs index a9c5d4485..14a88ab38 100644 --- a/src/sync.rs +++ b/src/sync.rs @@ -277,34 +277,42 @@ pub fn local_sync() -> ( /// local and object store sync at the start of the server #[tokio::main(flavor = "current_thread")] pub async fn sync_start() -> anyhow::Result<()> { - // Monitor local sync duration at startup - monitor_task_duration( - "startup_local_sync", - Duration::from_secs(PARSEABLE.options.local_sync_threshold), - || async { - let mut local_sync_joinset = JoinSet::new(); - PARSEABLE - .streams - .flush_and_convert(&mut local_sync_joinset, true, false); - while let Some(res) = local_sync_joinset.join_next().await { - 
log_join_result(res, "flush and convert"); - } - }, - ) + async { + // Monitor local sync duration at startup + monitor_task_duration( + "startup_local_sync", + Duration::from_secs(PARSEABLE.options.local_sync_threshold), + || async { + let mut local_sync_joinset = JoinSet::new(); + PARSEABLE + .streams + .flush_and_convert(&mut local_sync_joinset, true, false); + while let Some(res) = local_sync_joinset.join_next().await { + log_join_result(res, "flush and convert"); + } + }, + ) + .await; + } + .instrument(info_span!("local_sync_startup")) .await; - // Monitor object store sync duration at startup - monitor_task_duration( - "startup_object_store_sync", - Duration::from_secs(PARSEABLE.options.object_store_sync_threshold), - || async { - let mut object_store_joinset = JoinSet::new(); - sync_all_streams(&mut object_store_joinset); - while let Some(res) = object_store_joinset.join_next().await { - log_join_result(res, "object store sync"); - } - }, - ) + async { + // Monitor object store sync duration at startup + monitor_task_duration( + "startup_object_store_sync", + Duration::from_secs(PARSEABLE.options.object_store_sync_threshold), + || async { + let mut object_store_joinset = JoinSet::new(); + sync_all_streams(&mut object_store_joinset); + while let Some(res) = object_store_joinset.join_next().await { + log_join_result(res, "object store sync"); + } + }, + ) + .await; + } + .instrument(info_span!("object_store_sync_startup")) .await; Ok(()) diff --git a/src/telemetry.rs b/src/telemetry.rs index 4e4d7dfdb..1a2212bda 100644 --- a/src/telemetry.rs +++ b/src/telemetry.rs @@ -49,7 +49,7 @@ const OTEL_EXPORTER_OTLP_PROTOCOL: &str = "OTEL_EXPORTER_OTLP_PROTOCOL"; /// /// Returns \`None\` when \`OTEL_EXPORTER_OTLP_ENDPOINT\` or `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT` is not set (OTEL disabled). /// The caller must call \`provider.shutdown()\` before process exit. 
-pub fn init_tracing() -> Option { +pub fn init_tracing(service: &'static str) -> Option { // Only used to decide whether OTEL is enabled; the SDK reads it again // from env to build the exporter (which also appends /v1/traces for HTTP). if std::env::var(OTEL_EXPORTER_OTLP_ENDPOINT).is_err() @@ -101,7 +101,7 @@ pub fn init_tracing() -> Option { // migration tables to rewrite attribute names across semconv versions — // so even if upstream semconv drifts, emitted telemetry remains translatable. let resource = Resource::builder_empty() - .with_service_name("parseable") + .with_service_name(service) .with_schema_url( std::iter::empty::(), "https://opentelemetry.io/schemas/1.56.0", From a61b67b4be7553f1687c11356b83f400473da0a8 Mon Sep 17 00:00:00 2001 From: Anant Vindal Date: Tue, 12 May 2026 10:23:18 +0530 Subject: [PATCH 16/18] Updates: reduce object-store calls, limitstore for metastore --- src/hottier.rs | 266 ++++++++++++------ src/metastore/metastore_traits.rs | 8 +- .../metastores/object_store_metastore.rs | 29 +- src/storage/azure_blob.rs | 3 +- src/storage/gcs.rs | 6 +- src/storage/s3.rs | 8 +- 6 files changed, 213 insertions(+), 107 deletions(-) diff --git a/src/hottier.rs b/src/hottier.rs index fae9f88e6..1e98d081b 100644 --- a/src/hottier.rs +++ b/src/hottier.rs @@ -18,7 +18,7 @@ use datafusion::common::HashSet; use std::{ - collections::{BTreeMap, HashMap}, + collections::HashMap, io, path::{Path, PathBuf}, sync::Arc, @@ -34,7 +34,7 @@ use crate::{ utils::{extract_datetime, human_size::bytes_to_human_size}, validator::error::HotTierValidationError, }; -use chrono::NaiveDate; +use chrono::{DateTime, NaiveDate, Timelike, Utc}; use futures::{StreamExt, TryStreamExt, stream::FuturesUnordered}; use futures_util::TryFutureExt; use object_store::{ObjectStoreExt, local::LocalFileSystem}; @@ -47,6 +47,15 @@ use tokio::fs::{self, DirEntry}; use tokio_stream::wrappers::ReadDirStream; use tracing::{Instrument, error, info}; +/// Floor a timestamp to the start of its 
minute (seconds + sub-second zeroed). +/// Used to produce a stable per-tick anchor so all spans within one tick share +/// the same cutoff value. +fn floor_to_minute(ts: DateTime) -> DateTime { + ts.with_second(0) + .and_then(|t| t.with_nanosecond(0)) + .unwrap_or(ts) +} + pub const STREAM_HOT_TIER_FILENAME: &str = ".hot_tier.json"; pub const MIN_STREAM_HOT_TIER_SIZE_BYTES: u64 = 10737418240; // 10 GiB pub const INTERNAL_STREAM_HOT_TIER_SIZE_BYTES: u64 = 10485760; //10 MiB @@ -622,6 +631,7 @@ impl HotTierManager { let this: &'static HotTierManager = self; let startup_span = tracing::info_span!("hottier.startup.bootstrap"); + let span = startup_span.clone(); tokio::spawn( async move { // pstats hot tier may need to be created on boot before any tasks @@ -637,7 +647,12 @@ impl HotTierManager { for tenant_id in tenants { for stream in PARSEABLE.streams.list(&tenant_id) { if this.check_stream_hot_tier_exists(&stream, &tenant_id) { - this.spawn_stream_tasks(stream, tenant_id.clone()).await; + let tenant_id = tenant_id.clone(); + + tokio::spawn(async move { + this.spawn_stream_tasks(stream, tenant_id).await; + }.instrument(span.clone())); + tokio::time::sleep(Duration::from_secs(8)).await; } else { // check for potential orphan directory on disk let path = if let Some(tenant_id) = tenant_id.as_ref() { @@ -657,7 +672,7 @@ impl HotTierManager { } } } - .instrument(startup_span), + .instrument(startup_span.clone()), ); Ok(()) } @@ -691,16 +706,18 @@ impl HotTierManager { let t = tenant_id.clone(); let latest = tokio::spawn(async move { loop { + let anchor = floor_to_minute(Utc::now()); let tick_span = tracing::info_span!( "hottier.tick", stream = %s, tenant = ?t, - phase = "latest" + phase = "latest", + anchor = %anchor ); async { info!("stream tick fired"); if let Err(err) = self - .process_stream(s.clone(), t.clone(), SyncPhase::Latest) + .process_stream(s.clone(), t.clone(), SyncPhase::Latest, anchor) .await { error!("latest sync error: {err:?}"); @@ -716,16 +733,18 @@ 
impl HotTierManager { let t = tenant_id.clone(); let historic = tokio::spawn(async move { loop { + let anchor = floor_to_minute(Utc::now()); let tick_span = tracing::info_span!( "hottier.tick", stream = %s, tenant = ?t, - phase = "historic" + phase = "historic", + anchor = %anchor ); async { info!("stream tick fired"); if let Err(err) = self - .process_stream(s.clone(), t.clone(), SyncPhase::Historic) + .process_stream(s.clone(), t.clone(), SyncPhase::Historic, anchor) .await { error!("historic sync error: {err:?}"); @@ -766,7 +785,7 @@ impl HotTierManager { #[tracing::instrument( name = "hottier.process_stream", skip(self), - fields(stream = %stream, tenant = ?tenant_id, phase = ?phase), + fields(stream = %stream, tenant = ?tenant_id, phase = ?phase, anchor = %anchor), err )] async fn process_stream( @@ -774,44 +793,10 @@ impl HotTierManager { stream: String, tenant_id: Option, phase: SyncPhase, + anchor: DateTime, ) -> Result<(), HotTierError> { - info!( - stream = %stream, - tenant = ?tenant_id, - phase = ?phase, - "fetching manifest from metastore" - ); - let mut s3_manifest_file_list = PARSEABLE - .metastore - .get_all_manifest_files(&stream, &tenant_id) - .await - .map_err(|e| { - error!( - stream = %stream, - tenant = ?tenant_id, - phase = ?phase, - error = ?e - ); - HotTierError::ObjectStorageError(ObjectStorageError::MetastoreError(Box::new( - e.to_detail(), - ))) - })?; - let date_count = s3_manifest_file_list.len(); - let total_files: usize = s3_manifest_file_list - .values() - .map(|m| m.iter().map(|x| x.files.len()).sum::()) - .sum(); - info!( - stream = %stream, - tenant = ?tenant_id, - phase = ?phase, - dates = date_count, - files = total_files, - "manifest fetched" - ); - let stream_start = std::time::Instant::now(); - self.process_manifest(&stream, &mut s3_manifest_file_list, &tenant_id, phase) + self.process_manifest(&stream, &tenant_id, phase, anchor) .await .map_err(|e| { error!( @@ -833,34 +818,123 @@ impl HotTierManager { Ok(()) } - /// 
process the hot tier files for the date for the stream - /// collect all manifests from metastore for the date, sort the parquet file list - /// in order to download the latest files first - /// download the parquet files if not present in hot tier directory + /// process the hot tier files for the stream + /// Determine the candidate dates for the current phase, fetch only those + /// manifests from the metastore, build a work list sorted newest-first by + /// file timestamp, then download via the existing reserve/commit flow. #[tracing::instrument( name = "hottier.process_manifest", - skip(self, manifest_files_to_download), - fields(stream = %stream, tenant = ?tenant_id, phase = ?phase, dates = manifest_files_to_download.len()) + skip(self), + fields( + stream = %stream, + tenant = ?tenant_id, + phase = ?phase, + anchor = %anchor, + candidate_dates = tracing::field::Empty, + work_count = tracing::field::Empty, + total_bytes = tracing::field::Empty, + ), + err )] async fn process_manifest( &self, stream: &str, - manifest_files_to_download: &mut BTreeMap>, tenant_id: &Option, phase: SyncPhase, + anchor: DateTime, ) -> Result<(), HotTierError> { - if manifest_files_to_download.is_empty() { + let latest_minutes = PARSEABLE.options.hot_tier_latest_minutes; + let latest_window = chrono::Duration::minutes(latest_minutes.try_into().unwrap()); + let historic_cutoff = anchor - latest_window; + let historic_cutoff_naive = historic_cutoff.naive_utc(); + let today_date_key = format!("date={}", anchor.date_naive()); + + // Determine which date keys to fetch from the metastore this tick. + let candidate_dates: Vec = match phase { + SyncPhase::Latest => { + // Dates covered by [historic_cutoff, anchor]. Usually just today, + // or today + yesterday if window crosses midnight. 
+ let start = historic_cutoff.date_naive(); + let end = anchor.date_naive(); + let mut out = Vec::new(); + let mut d = start; + while d <= end { + out.push(format!("date={d}")); + d = d.succ_opt().unwrap_or(d); + if out.len() > 365 { + break; + } + } + out + } + SyncPhase::Historic => { + let local = self + .fetch_hot_tier_dates(stream, tenant_id) + .await + .unwrap_or_default() + .into_iter() + .map(|d| format!("date={d}")) + .collect::>(); + let s3 = PARSEABLE + .storage() + .get_object_store() + .list_dates(stream, tenant_id) + .await + .unwrap_or_default(); + + let mut union: std::collections::BTreeSet = local.into_iter().collect(); + union.extend(s3); + + let mut out = Vec::new(); + for date_key in union { + // drop today and anything >= today (Latest handles those) + if date_key.as_str() >= today_date_key.as_str() { + continue; + } + if self.is_locally_complete(stream, &date_key, tenant_id).await { + continue; + } + out.push(date_key); + } + // Newest-first: discover newest missing past date first. 
+ out.sort(); + out.reverse(); + out + } + }; + tracing::Span::current().record("candidate_dates", candidate_dates.len()); + + if candidate_dates.is_empty() { + info!( + stream = %stream, + tenant = ?tenant_id, + phase = ?phase, + "no candidate dates this tick" + ); return Ok(()); } - let latest_minutes = PARSEABLE.options.hot_tier_latest_minutes; - let cutoff = chrono::Utc::now().naive_utc() - - chrono::Duration::minutes(latest_minutes.try_into().unwrap()); + let s3_manifests = PARSEABLE + .metastore + .get_manifest_files_for_dates(stream, tenant_id, &candidate_dates) + .await + .map_err(|e| { + error!( + stream = %stream, + tenant = ?tenant_id, + phase = ?phase, + error = ?e, + "manifest fetch failed" + ); + HotTierError::ObjectStorageError(ObjectStorageError::MetastoreError(Box::new( + e.to_detail(), + ))) + })?; - // Build flat work list: (date, file, local_path) ordered latest-date-first, - // and within a date by descending file_path (matches old `pop()` order). - let mut work: Vec<(NaiveDate, File, PathBuf)> = Vec::new(); - for (str_date, manifest_files) in manifest_files_to_download.iter().rev() { + // Build flat work list across all candidate dates: keep only files + // matching this phase's cutoff and not already on disk. 
+ let mut work: Vec<(NaiveDate, chrono::NaiveDateTime, File, PathBuf)> = Vec::new(); + for (str_date, manifest_files) in s3_manifests.iter() { let date = match NaiveDate::parse_from_str(str_date.trim_start_matches("date="), "%Y-%m-%d") { Ok(d) => d, @@ -870,30 +944,36 @@ impl HotTierManager { } }; - let mut combined = Manifest::default(); for storage_manifest in manifest_files { - combined.files.extend(storage_manifest.files.clone()); - } - combined.files.sort_by_key(|f| f.file_path.clone()); - while let Some(parquet_file) = combined.files.pop() { - let parquet_path = self.hot_tier_path.join(&parquet_file.file_path); - if parquet_path.exists() { - continue; - } - let dt = extract_datetime(&parquet_file.file_path); - let is_latest = dt.map(|d| d >= cutoff).unwrap_or(false); - let keep = match phase { - SyncPhase::Latest => is_latest, - SyncPhase::Historic => !is_latest, - }; - if keep { - work.push((date, parquet_file, parquet_path)); + for parquet_file in &storage_manifest.files { + let parquet_path = self.hot_tier_path.join(&parquet_file.file_path); + if parquet_path.exists() { + continue; + } + let dt = match extract_datetime(&parquet_file.file_path) { + Some(d) => d, + None => continue, + }; + let is_latest = dt >= historic_cutoff_naive; + let keep = match phase { + SyncPhase::Latest => is_latest, + SyncPhase::Historic => !is_latest, + }; + if keep { + work.push((date, dt, parquet_file.clone(), parquet_path)); + } } } } + // Newest first by file timestamp. 
+ work.sort_by_key(|b| std::cmp::Reverse(b.1)); + let work_count = work.len(); - let total_bytes: u64 = work.iter().map(|(_, f, _)| f.file_size).sum(); + let total_bytes: u64 = work.iter().map(|(_, _, f, _)| f.file_size).sum(); + tracing::Span::current() + .record("work_count", work_count) + .record("total_bytes", total_bytes); if work.is_empty() { info!( stream = %stream, @@ -923,7 +1003,7 @@ impl HotTierManager { let tenant_owned = tenant_id.clone(); let results: Vec> = futures::stream::iter(work) - .map(|(date, file, parquet_path)| { + .map(|(date, _dt, file, parquet_path)| { let state = state.clone(); let stream = stream_owned.clone(); let tenant_id = tenant_owned.clone(); @@ -1261,6 +1341,34 @@ impl HotTierManager { } } + /// A past date is treated as fully synced when its on-disk hottier + /// manifest exists and lists at least one parquet. Used by the Historic + /// phase to skip the metastore fetch for dates we already have data for. + /// `date_key` is the partition directory name (e.g. "date=2026-05-12"). 
+ async fn is_locally_complete( + &self, + stream: &str, + date_key: &str, + tenant_id: &Option, + ) -> bool { + let date_dir = if let Some(tenant) = tenant_id.as_ref() { + self.hot_tier_path.join(tenant).join(stream).join(date_key) + } else { + self.hot_tier_path.join(stream).join(date_key) + }; + let manifest_path = date_dir.join("hottier.manifest.json"); + if !manifest_path.exists() { + return false; + } + match fs::read(&manifest_path).await { + Ok(bytes) => match serde_json::from_slice::(&bytes) { + Ok(m) => !m.files.is_empty(), + Err(_) => false, + }, + Err(_) => false, + } + } + /// Returns the list of manifest files present in hot tier directory for the stream pub async fn get_hot_tier_manifest_files( &self, diff --git a/src/metastore/metastore_traits.rs b/src/metastore/metastore_traits.rs index 79745ecf8..0df0e8a5d 100644 --- a/src/metastore/metastore_traits.rs +++ b/src/metastore/metastore_traits.rs @@ -248,11 +248,15 @@ pub trait Metastore: std::fmt::Debug + Send + Sync { tenant_id: &Option, ) -> Result, MetastoreError>; - /// manifest - async fn get_all_manifest_files( + /// Fetch manifests only for the explicitly requested date keys + /// (e.g. `["date=2026-05-12"]`). Skips the top-level LIST to discover + /// dates — callers must already know which dates to query (e.g. via + /// the local hot-tier dir + a single object-store LIST when needed). 
+ async fn get_manifest_files_for_dates( &self, stream_name: &str, tenant_id: &Option, + dates: &[String], ) -> Result>, MetastoreError>; async fn get_manifest( &self, diff --git a/src/metastore/metastores/object_store_metastore.rs b/src/metastore/metastores/object_store_metastore.rs index 904bcca24..5adf93cef 100644 --- a/src/metastore/metastores/object_store_metastore.rs +++ b/src/metastore/metastores/object_store_metastore.rs @@ -929,34 +929,24 @@ impl Metastore for ObjectStoreMetastore { } /// Fetch all `Manifest` files - async fn get_all_manifest_files( + async fn get_manifest_files_for_dates( &self, stream_name: &str, tenant_id: &Option, + dates: &[String], ) -> Result>, MetastoreError> { let mut result_file_list: BTreeMap> = BTreeMap::new(); - let root = if let Some(tenant) = tenant_id { - format!("{tenant}/{stream_name}") - } else { - stream_name.into() - }; - let resp = self.storage.list_with_delimiter(Some(root.into())).await?; - - let dates = resp - .common_prefixes - .iter() - .flat_map(|path| path.parts()) - .filter(|name| name.as_ref() != stream_name && name.as_ref() != STREAM_ROOT_DIRECTORY) - .map(|name| name.as_ref().to_string()) - .collect::>(); - for date in dates { let date_path = if let Some(tenant) = tenant_id { - object_store::path::Path::from(format!("{}/{}/{}", tenant, stream_name, &date)) + object_store::path::Path::from(format!("{}/{}/{}", tenant, stream_name, date)) } else { - object_store::path::Path::from(format!("{}/{}", stream_name, &date)) + object_store::path::Path::from(format!("{}/{}", stream_name, date)) + }; + let resp = match self.storage.list_with_delimiter(Some(date_path)).await { + Ok(r) => r, + Err(ObjectStorageError::NoSuchKey(_)) => continue, + Err(e) => return Err(e.into()), }; - let resp = self.storage.list_with_delimiter(Some(date_path)).await?; let manifest_paths: Vec = resp .objects @@ -970,7 +960,6 @@ impl Metastore for ObjectStoreMetastore { .storage .get_object(&RelativePathBuf::from(path), tenant_id) .await?; - 
result_file_list .entry(date.clone()) .or_default() diff --git a/src/storage/azure_blob.rs b/src/storage/azure_blob.rs index aab307c54..1661d16fc 100644 --- a/src/storage/azure_blob.rs +++ b/src/storage/azure_blob.rs @@ -188,6 +188,7 @@ impl ObjectStorageProvider for AzureBlobConfig { let azure = self.get_default_builder().build().unwrap(); // limit objectstore to a concurrent request limit let azure = LimitStore::new(azure, super::MAX_OBJECT_STORE_REQUESTS); + let azure = MetricLayer::new(azure, "azure"); Arc::new(BlobStore { client: Arc::new(azure), account: self.account.clone(), @@ -205,7 +206,7 @@ impl ObjectStorageProvider for AzureBlobConfig { // object store such as S3 and Azure Blob #[derive(Debug)] pub struct BlobStore { - client: Arc>, + client: Arc>>, account: String, container: String, root: StorePath, diff --git a/src/storage/gcs.rs b/src/storage/gcs.rs index fa3e8c550..56bcc71a9 100644 --- a/src/storage/gcs.rs +++ b/src/storage/gcs.rs @@ -150,7 +150,9 @@ impl ObjectStorageProvider for GcsConfig { fn construct_client(&self) -> Arc { let gcs = self.get_default_builder().build().unwrap(); - + // limit objectstore to a concurrent request limit + let gcs = LimitStore::new(gcs, super::MAX_OBJECT_STORE_REQUESTS); + let gcs = MetricLayer::new(gcs, "gcs"); Arc::new(Gcs { client: Arc::new(gcs), bucket: self.bucket_name.clone(), @@ -172,7 +174,7 @@ impl ObjectStorageProvider for GcsConfig { #[derive(Debug)] pub struct Gcs { - client: Arc, + client: Arc>>, bucket: String, root: StorePath, } diff --git a/src/storage/s3.rs b/src/storage/s3.rs index bb8135287..dd3013952 100644 --- a/src/storage/s3.rs +++ b/src/storage/s3.rs @@ -318,9 +318,11 @@ impl ObjectStorageProvider for S3Config { fn construct_client(&self) -> Arc { let s3 = self.get_default_builder().build().unwrap(); - + // limit objectstore to a concurrent request limit + let s3 = LimitStore::new(s3, super::MAX_OBJECT_STORE_REQUESTS); + let s3 = MetricLayer::new(s3, "s3"); Arc::new(S3 { - client: s3, + 
client: Arc::new(s3), bucket: self.bucket_name.clone(), root: StorePath::from(""), }) @@ -333,7 +335,7 @@ impl ObjectStorageProvider for S3Config { #[derive(Debug)] pub struct S3 { - client: AmazonS3, + client: Arc>>, bucket: String, root: StorePath, } From 3e51ed5c7111a5e1dafbc647ac042ffc5c2b25fb Mon Sep 17 00:00:00 2001 From: Anant Vindal Date: Tue, 12 May 2026 15:08:47 +0530 Subject: [PATCH 17/18] try hottier abort --- src/handlers/http/modal/mod.rs | 5 +++++ src/hottier.rs | 16 ++++++++++------ 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/src/handlers/http/modal/mod.rs b/src/handlers/http/modal/mod.rs index d7a936c5f..d5cac5049 100644 --- a/src/handlers/http/modal/mod.rs +++ b/src/handlers/http/modal/mod.rs @@ -160,6 +160,11 @@ pub trait ParseableServer { // Shutdown resource monitor let _ = resource_shutdown_tx.send(()); + // Shutdown hottier + if let Some(ht_global) = HotTierManager::global() { + ht_global.abort_all().await; + } + // Initiate graceful shutdown info!("Graceful shutdown of HTTP server triggered"); srv_handle.stop(true).await; diff --git a/src/hottier.rs b/src/hottier.rs index 1e98d081b..8eba302fa 100644 --- a/src/hottier.rs +++ b/src/hottier.rs @@ -613,6 +613,16 @@ impl HotTierManager { Ok(path) } + #[tracing::instrument(name = "hottier.abort", skip(self))] + pub async fn abort_all(&self) { + let guard = self.tasks.write().await; + for (streamkey, task) in guard.iter() { + task.latest.abort(); + task.historic.abort(); + info!("aborted hot tier tasks for- {streamkey:?}"); + } + } + /// Discover hot-tier-enabled streams at boot and spawn a per-stream pair /// of (Latest, Historic) loops for each. New streams added later acquire /// their own loops via `spawn_stream_tasks` from the PUT hot-tier handler. 
@@ -770,12 +780,6 @@ impl HotTierManager { if let Some(t) = self.tasks.write().await.remove(&key) { t.latest.abort(); t.historic.abort(); - - // run these tasks till completion or till JoinError - // post this, we delete the folders so its better if these tasks complete - // and then deletion happens - let _ = t.latest.await; - let _ = t.historic.await; info!(stream = %stream, tenant = ?tenant_id, "aborted per-stream hot tier tasks"); } } From 411231eb71290ecebdce2219e218c7fe1d273590 Mon Sep 17 00:00:00 2001 From: Anant Vindal Date: Thu, 14 May 2026 17:42:19 +0530 Subject: [PATCH 18/18] fix: New runtime for hottier Running multiple parallel chunked downloads on the main runtime resulted in a slowdown in other incoming requests. This was mitigated by creating a new runtime for hottier downloads. --- src/cli.rs | 20 +- src/handlers/http/logstream.rs | 30 +- src/handlers/http/modal/mod.rs | 80 +-- .../http/modal/query/querier_logstream.rs | 4 +- src/handlers/http/modal/query_server.rs | 32 +- src/handlers/http/modal/server.rs | 32 +- src/hottier.rs | 661 ++++++++++++------ .../metastores/object_store_metastore.rs | 117 +++- src/parseable/mod.rs | 17 +- src/query/stream_schema_provider.rs | 4 +- src/storage/azure_blob.rs | 4 +- src/storage/gcs.rs | 7 +- src/storage/mod.rs | 2 +- src/storage/s3.rs | 18 +- src/telemetry.rs | 37 + src/utils/mod.rs | 12 +- 16 files changed, 685 insertions(+), 392 deletions(-) diff --git a/src/cli.rs b/src/cli.rs index 07ae2e843..9803a3a46 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -306,6 +306,14 @@ pub struct Options { )] pub local_staging_path: PathBuf, + // trace modules + #[arg( + long, + env = "P_TRACE_MODULES", + help = "Comma separated values for modules to trace" + )] + pub trace_modules: Option, + #[arg( long = "hot-tier-path", env = "P_HOT_TIER_DIR", @@ -335,10 +343,11 @@ pub struct Options { #[arg( long = "hot-tier-files-per-stream-concurrency", env = "P_HOT_TIER_FILES_PER_STREAM_CONCURRENCY", + value_parser = 
clap::value_parser!(u32).range(1..), default_value = "4", help = "Number of concurrent parquet file downloads per stream during hot tier sync" )] - pub hot_tier_files_per_stream_concurrency: usize, + pub hot_tier_files_per_stream_concurrency: u32, #[arg( long = "hot-tier-latest-minutes", @@ -349,6 +358,15 @@ pub struct Options { )] pub hot_tier_latest_minutes: u64, + #[arg( + long = "hot-tier-per-tick-cap", + env = "P_HISTORIC_PER_TICK_CAP", + value_parser = clap::value_parser!(u32).range(10..), + default_value = "100", + help = "Maximum files to download per historic tick." + )] + pub historic_per_tick_cap: u32, + #[arg( long = "hot-tier-historic-sync-minutes", env = "P_HOT_TIER_HISTORIC_SYNC_MINUTES", diff --git a/src/handlers/http/logstream.rs b/src/handlers/http/logstream.rs index 4c38ffee2..2cfa6c08f 100644 --- a/src/handlers/http/logstream.rs +++ b/src/handlers/http/logstream.rs @@ -20,7 +20,7 @@ use self::error::StreamError; use super::cluster::utils::{IngestionStats, QueriedStats, StorageStats}; use super::query::update_schema_when_distributed; use crate::event::format::override_data_type; -use crate::hottier::{CURRENT_HOT_TIER_VERSION, HotTierManager, StreamHotTier}; +use crate::hottier::{CURRENT_HOT_TIER_VERSION, GLOBAL_HOTTIER, StreamHotTier}; use crate::metadata::SchemaVersion; use crate::metrics::{EVENTS_INGESTED_DATE, EVENTS_INGESTED_SIZE_DATE, EVENTS_STORAGE_SIZE_DATE}; use crate::parseable::{DEFAULT_TENANT, PARSEABLE, StreamNotFound}; @@ -47,7 +47,7 @@ use itertools::Itertools; use serde_json::{Value, json}; use std::fs; use std::sync::Arc; -use tracing::warn; +use tracing::{Instrument, warn}; pub async fn delete( req: HttpRequest, @@ -77,7 +77,7 @@ pub async fn delete( ) } - if let Some(hot_tier_manager) = HotTierManager::global() + if let Some(hot_tier_manager) = GLOBAL_HOTTIER.get() && hot_tier_manager.check_stream_hot_tier_exists(&stream_name, &tenant_id) { hot_tier_manager @@ -425,12 +425,13 @@ pub async fn put_stream_hot_tier( ) -> Result { 
let stream_name = logstream.into_inner(); let tenant_id = get_tenant_id_from_request(&req); - tracing::Span::current() + let current_span = tracing::Span::current(); + current_span .record("stream", tracing::field::display(&stream_name)) .record("tenant", tracing::field::debug(&tenant_id)); // For query mode, if the stream not found in memory map, - //check if it exists in the storage - //create stream and schema from storage + // check if it exists in the storage + // create stream and schema from storage if !PARSEABLE .check_or_load_stream(&stream_name, &tenant_id) .await @@ -449,9 +450,8 @@ pub async fn put_stream_hot_tier( validator::hot_tier(&hottier.size.to_string())?; - // TODO tenants stream.set_hot_tier(Some(hottier.clone())); - let Some(hot_tier_manager) = HotTierManager::global() else { + let Some(hot_tier_manager) = GLOBAL_HOTTIER.get() else { return Err(StreamError::HotTierNotEnabled(stream_name)); }; let existing_hot_tier_used_size = hot_tier_manager @@ -477,9 +477,13 @@ pub async fn put_stream_hot_tier( .metastore .put_stream_json(&stream_metadata, &stream_name, &tenant_id) .await?; + let stream = stream_name.clone(); + let tenant = tenant_id.clone(); hot_tier_manager - .spawn_stream_tasks(stream_name.clone(), tenant_id.clone()) + .spawn_stream_task(stream, tenant) + .instrument(current_span) .await; + Ok(( format!("hot tier set for stream {stream_name}"), StatusCode::OK, @@ -510,7 +514,7 @@ pub async fn get_stream_hot_tier( return Err(StreamNotFound(stream_name.clone()).into()); } - let Some(hot_tier_manager) = HotTierManager::global() else { + let Some(hot_tier_manager) = GLOBAL_HOTTIER.get() else { return Err(StreamError::HotTierNotEnabled(stream_name)); }; let meta = hot_tier_manager @@ -531,7 +535,8 @@ pub async fn delete_stream_hot_tier( ) -> Result { let stream_name = logstream.into_inner(); let tenant_id = get_tenant_id_from_request(&req); - tracing::Span::current() + let current_span = tracing::Span::current(); + current_span 
.record("stream", tracing::field::display(&stream_name)) .record("tenant", tracing::field::debug(&tenant_id)); // For query mode, if the stream not found in memory map, @@ -555,12 +560,13 @@ pub async fn delete_stream_hot_tier( }); } - let Some(hot_tier_manager) = HotTierManager::global() else { + let Some(hot_tier_manager) = GLOBAL_HOTTIER.get() else { return Err(StreamError::HotTierNotEnabled(stream_name)); }; hot_tier_manager .delete_hot_tier(&stream_name, &tenant_id) + .instrument(tracing::Span::current()) .await?; let mut stream_metadata: ObjectStoreFormat = serde_json::from_slice( diff --git a/src/handlers/http/modal/mod.rs b/src/handlers/http/modal/mod.rs index d5cac5049..dc4346efb 100644 --- a/src/handlers/http/modal/mod.rs +++ b/src/handlers/http/modal/mod.rs @@ -38,11 +38,11 @@ use crate::{ alerts::{ALERTS, get_alert_manager, target::TARGETS}, cli::Options, correlation::CORRELATIONS, - hottier::{HotTierManager, StreamHotTier}, + hottier::GLOBAL_HOTTIER, metastore::metastore_traits::MetastoreObject, oauth::{OAuthProvider, connect_oidc}, option::Mode, - parseable::{DEFAULT_TENANT, PARSEABLE}, + parseable::PARSEABLE, storage::{ObjectStorageProvider, PARSEABLE_ROOT_DIRECTORY}, users::{dashboards::DASHBOARDS, filters::FILTERS}, utils::get_node_id, @@ -161,8 +161,8 @@ pub trait ParseableServer { let _ = resource_shutdown_tx.send(()); // Shutdown hottier - if let Some(ht_global) = HotTierManager::global() { - ht_global.abort_all().await; + if let Some(htm) = GLOBAL_HOTTIER.get() { + htm.abort_all().await; } // Initiate graceful shutdown @@ -628,78 +628,6 @@ pub type IndexerMetadata = NodeMetadata; pub type QuerierMetadata = NodeMetadata; pub type PrismMetadata = NodeMetadata; -/// Initialize hot tier metadata files for streams that have hot tier configuration -/// in their stream metadata but don't have local hot tier metadata files yet. -/// This function is called once during query server startup. 
-#[tracing::instrument(name = "hottier.init_metadata_startup", skip(hot_tier_manager))] -pub async fn initialize_hot_tier_metadata_on_startup( - hot_tier_manager: &HotTierManager, -) -> anyhow::Result<()> { - // Collect hot tier configurations from streams before doing async operations - let hot_tier_configs: Vec<(String, Option, StreamHotTier)> = { - let tenants_guard = PARSEABLE.streams.read().unwrap(); - tenants_guard - .iter() - .flat_map(|(tenant_id, streams)| { - streams.iter().filter_map(|(stream_name, stream)| { - // Skip if hot tier metadata file already exists for this stream - let tenant_id = if tenant_id.eq(DEFAULT_TENANT) { - None - } else { - Some(tenant_id.clone()) - }; - if hot_tier_manager.check_stream_hot_tier_exists(stream_name, &tenant_id) { - return None; - } - - // Get the hot tier configuration from the in-memory stream metadata - stream - .get_hot_tier() - .map(|config| (stream_name.clone(), tenant_id, config.clone())) - }) - }) - .collect() - // let streams_guard = PARSEABLE.streams.read().unwrap(); - // streams_guard - // .iter() - // .filter_map(|(stream_name, stream)| { - // // Skip if hot tier metadata file already exists for this stream - // if hot_tier_manager.check_stream_hot_tier_exists(stream_name) { - // return None; - // } - - // // Get the hot tier configuration from the in-memory stream metadata - // stream - // .get_hot_tier() - // .map(|config| (stream_name.clone(), config)) - // }) - // .collect() - }; - - for (stream_name, tenant_id, hot_tier_config) in hot_tier_configs { - // Create the hot tier metadata file with the configuration from stream metadata - let mut hot_tier_metadata = hot_tier_config; - hot_tier_metadata.used_size = 0; - hot_tier_metadata.available_size = hot_tier_metadata.size; - hot_tier_metadata.oldest_date_time_entry = None; - if hot_tier_metadata.version.is_none() { - hot_tier_metadata.version = Some(crate::hottier::CURRENT_HOT_TIER_VERSION.to_string()); - } - - if let Err(e) = hot_tier_manager - 
.put_hot_tier(&stream_name, &mut hot_tier_metadata, &tenant_id) - .await - { - warn!( - "Failed to initialize hot tier metadata for stream {}: {}", - stream_name, e - ); - } - } - - Ok(()) -} - #[cfg(test)] mod test { use actix_web::body::MessageBody; diff --git a/src/handlers/http/modal/query/querier_logstream.rs b/src/handlers/http/modal/query/querier_logstream.rs index 80c674706..121fc1fca 100644 --- a/src/handlers/http/modal/query/querier_logstream.rs +++ b/src/handlers/http/modal/query/querier_logstream.rs @@ -32,6 +32,7 @@ use tracing::{error, warn}; pub static CREATE_STREAM_LOCK: Mutex<()> = Mutex::const_new(()); use crate::handlers::http::middleware::{CLUSTER_SECRET, CLUSTER_SECRET_HEADER}; +use crate::hottier::GLOBAL_HOTTIER; use crate::parseable::DEFAULT_TENANT; use crate::utils::get_user_from_request; use crate::{ @@ -47,7 +48,6 @@ use crate::{ modal::{NodeMetadata, NodeType}, }, }, - hottier::HotTierManager, parseable::{PARSEABLE, StreamNotFound}, stats, storage::{ObjectStoreFormat, StreamType}, @@ -85,7 +85,7 @@ pub async fn delete( ) } - if let Some(hot_tier_manager) = HotTierManager::global() + if let Some(hot_tier_manager) = GLOBAL_HOTTIER.get() && hot_tier_manager.check_stream_hot_tier_exists(&stream_name, &tenant_id) { hot_tier_manager diff --git a/src/handlers/http/modal/query_server.rs b/src/handlers/http/modal/query_server.rs index af1910ccd..10913799d 100644 --- a/src/handlers/http/modal/query_server.rs +++ b/src/handlers/http/modal/query_server.rs @@ -24,10 +24,12 @@ use crate::handlers::http::cluster; use crate::handlers::http::logstream; use crate::handlers::http::max_event_payload_size; use crate::handlers::http::middleware::{DisAllowRootUser, RouteExt}; -use crate::handlers::http::modal::initialize_hot_tier_metadata_on_startup; use crate::handlers::http::{base_path, prism_base_path}; use crate::handlers::http::{rbac, role}; +use crate::hottier::GLOBAL_HOTTIER; use crate::hottier::HotTierManager; +use crate::hottier::HotTierMessage; +use 
crate::hottier::hottier_runtime; use crate::rbac::role::Action; use crate::{analytics, migration, storage, sync}; use actix_web::web::{ServiceConfig, resource}; @@ -35,6 +37,7 @@ use actix_web::{Scope, web}; use actix_web_prometheus::PrometheusMetrics; use async_trait::async_trait; use bytes::Bytes; +use tokio::sync::mpsc; use tokio::sync::{OnceCell, oneshot}; use crate::Server; @@ -126,13 +129,26 @@ impl ParseableServer for QueryServer { analytics::init_analytics_scheduler()?; } - if let Some(hot_tier_manager) = HotTierManager::global() { - // Initialize hot tier metadata files for streams that have hot tier configuration - // but don't have local hot tier metadata files yet - if let Err(e) = initialize_hot_tier_metadata_on_startup(hot_tier_manager).await { - tracing::warn!("Failed to initialize hot tier metadata on startup: {}", e); - } - hot_tier_manager.download_from_s3()?; + // Initialize hot tier metadata files for streams that have hot tier configuration + // but don't have local hot tier metadata files yet + if let Some(htm) = PARSEABLE + .options + .hot_tier_storage_path + .as_ref() + .map(|hot_tier_path| { + // start hottier runtime + let (sender, receiver): ( + mpsc::UnboundedSender, + mpsc::UnboundedReceiver, + ) = mpsc::unbounded_channel(); + std::thread::spawn(|| hottier_runtime(receiver)); + + // set global hottier + + GLOBAL_HOTTIER.get_or_init(|| HotTierManager::new(hot_tier_path, sender)) + }) + { + htm.start_all_tasks().await; }; // Run sync on a background thread diff --git a/src/handlers/http/modal/server.rs b/src/handlers/http/modal/server.rs index 9a7b4ea69..6a0309075 100644 --- a/src/handlers/http/modal/server.rs +++ b/src/handlers/http/modal/server.rs @@ -27,13 +27,15 @@ use crate::handlers::http::demo_data::get_demo_data; use crate::handlers::http::health_check; use crate::handlers::http::max_event_payload_size; use crate::handlers::http::middleware::IntraClusterRequest; -use 
crate::handlers::http::modal::initialize_hot_tier_metadata_on_startup; use crate::handlers::http::prism_base_path; use crate::handlers::http::query; use crate::handlers::http::targets; use crate::handlers::http::users::dashboards; use crate::handlers::http::users::filters; +use crate::hottier::GLOBAL_HOTTIER; use crate::hottier::HotTierManager; +use crate::hottier::HotTierMessage; +use crate::hottier::hottier_runtime; use crate::metrics; use crate::migration; use crate::storage; @@ -49,6 +51,7 @@ use actix_web_prometheus::PrometheusMetrics; use actix_web_static_files::ResourceFiles; use async_trait::async_trait; use bytes::Bytes; +use tokio::sync::mpsc; use tokio::sync::oneshot; use crate::{ @@ -132,14 +135,25 @@ impl ParseableServer for Server { // local sync on init thread::spawn(sync_start); - if let Some(hot_tier_manager) = HotTierManager::global() { - // Initialize hot tier metadata files for streams that have hot tier configuration - // but don't have local hot tier metadata files yet - if let Err(e) = initialize_hot_tier_metadata_on_startup(hot_tier_manager).await { - tracing::warn!("Failed to initialize hot tier metadata on startup: {}", e); - } - hot_tier_manager.download_from_s3()?; - }; + if let Some(htm) = PARSEABLE + .options + .hot_tier_storage_path + .as_ref() + .map(|hot_tier_path| { + // start hottier runtime + let (sender, receiver): ( + mpsc::UnboundedSender, + mpsc::UnboundedReceiver, + ) = mpsc::unbounded_channel(); + std::thread::spawn(|| hottier_runtime(receiver)); + + // set global hottier + + GLOBAL_HOTTIER.get_or_init(|| HotTierManager::new(hot_tier_path, sender)) + }) + { + htm.start_all_tasks().await; + } // Run sync on a background thread let (cancel_tx, cancel_rx) = oneshot::channel(); diff --git a/src/hottier.rs b/src/hottier.rs index 8eba302fa..61341cdac 100644 --- a/src/hottier.rs +++ b/src/hottier.rs @@ -21,9 +21,9 @@ use std::{ collections::HashMap, io, path::{Path, PathBuf}, - sync::Arc, + sync::{Arc, OnceLock}, }; -use 
tokio::sync::{Mutex as AsyncMutex, RwLock as AsyncRwLock}; +use tokio::sync::{Mutex as AsyncMutex, RwLock as AsyncRwLock, mpsc}; use crate::{ catalog::manifest::{File, Manifest}, @@ -45,7 +45,21 @@ use std::time::Duration; use sysinfo::Disks; use tokio::fs::{self, DirEntry}; use tokio_stream::wrappers::ReadDirStream; -use tracing::{Instrument, error, info}; +use tracing::{Instrument, error, info, warn}; + +pub enum HotTierMessage { + StartTask(StreamKey), + KillTask(StreamKey), + // KillAll, + StartAll, +} + +pub static GLOBAL_HOTTIER: OnceLock = OnceLock::new(); + +pub static HOTTIER_RUNTIME: OnceCell<( + mpsc::UnboundedSender, + mpsc::UnboundedReceiver, +)> = OnceCell::new(); /// Floor a timestamp to the start of its minute (seconds + sub-second zeroed). /// Used to produce a stable per-tick anchor so all spans within one tick share @@ -78,6 +92,11 @@ pub struct StreamHotTier { /// commit, and per-date manifest writes. Downloads run outside the lock. struct StreamSyncState { sht: AsyncMutex, + /// Past-date keys (e.g. `date=2026-05-11`) whose local file count is + /// known to match the S3 manifest count. Historic phase skips fetching + /// these. Populated after a tick observes `local_count >= s3_count`. + /// In-memory only; rebuilt on restart. + completed_dates: AsyncRwLock>, } /// Hot-tier sync runs in two phases. 
Latest pulls files newer than @@ -89,7 +108,8 @@ enum SyncPhase { Historic, } -type StreamKey = (Option, String); +pub type StreamKey = (Option, String); +pub type HotTierResponse = Result<(), HotTierError>; struct StreamTasks { latest: tokio::task::JoinHandle<()>, @@ -101,19 +121,136 @@ pub struct HotTierManager { hot_tier_path: &'static Path, state_cache: AsyncRwLock>>, tasks: AsyncRwLock>, + sender: mpsc::UnboundedSender, +} + +#[tokio::main(flavor = "multi_thread")] +pub async fn hottier_runtime( + mut receiver: mpsc::UnboundedReceiver, + // sender: mpsc::UnboundedSender, +) { + while let Some(msg) = receiver.recv().await { + match msg { + HotTierMessage::StartTask((tenant_id, stream)) => { + tokio::spawn(async move { + if let Some(htm) = GLOBAL_HOTTIER.get() { + htm.spawn_stream_task_inner(stream, tenant_id).await; + } + }); + } + HotTierMessage::KillTask((tenant_id, stream)) => { + if let Some(htm) = GLOBAL_HOTTIER.get() { + htm.abort_stream_tasks(&stream, &tenant_id).await; + let path = if let Some(tenant_id) = tenant_id.as_ref() { + htm.hot_tier_path.join(tenant_id).join(&stream) + } else { + htm.hot_tier_path.join(&stream) + }; + let _ = fs::remove_dir_all(path) + .await + .map_err(|e| { + error!( + stream = %stream, + tenant = ?tenant_id, + error = ?e + ); + e + }) + .map_err(|e| { + error!( + stream=?stream, + tenant_id=?tenant_id, + error=?e, + "kill task" + ) + }); + htm.invalidate_state(&stream, &tenant_id).await; + } + } + HotTierMessage::StartAll => { + let htm = GLOBAL_HOTTIER.get().unwrap(); + let latest_min = PARSEABLE.options.hot_tier_latest_minutes; + let historic_min = PARSEABLE.options.hot_tier_historic_sync_minutes; + info!( + latest_minutes = latest_min, + historic_sync_minutes = historic_min, + "hot tier scheduler starting" + ); + + let this: &'static HotTierManager = htm; + let startup_span = tracing::info_span!("hottier.startup.bootstrap"); + let span = startup_span.clone(); + tokio::spawn( + async move { + // pstats hot tier may need to 
be created on boot before any tasks + // can pick it up. + if let Err(e) = this.create_pstats_hot_tier().await { + tracing::error!("Skipping pstats hot tier creation because of error: {e}"); + } + let tenants = if let Some(tenants) = PARSEABLE.list_tenants() { + tenants.into_iter().map(Some).collect::>() + } else { + vec![None] + }; + for tenant_id in tenants { + for stream in PARSEABLE.streams.list(&tenant_id) { + if this.check_stream_hot_tier_exists(&stream, &tenant_id) { + let tenant_id = tenant_id.clone(); + + tokio::spawn(async move { + this.spawn_stream_task_inner(stream, tenant_id).await; + }.instrument(span.clone())); + tokio::time::sleep(Duration::from_secs(2)).await; + } else { + // check for potential orphan directory on disk + let path = if let Some(tenant_id) = tenant_id.as_ref() { + this.hot_tier_path.join(tenant_id).join(stream) + } else { + this.hot_tier_path.join(stream) + }; + if path.exists() { + // delete this entire folder as stream meta says no hottier for stream + if let Err(e) = fs::remove_dir_all(&path).await { + tracing::error!( + "Unable to remove orphaned hottier dir- `{path:?}` with error- {e}" + ); + }; + } + } + } + } + } + .instrument(startup_span.clone()), + ); + } + } + } } impl HotTierManager { - pub fn new(hot_tier_path: &'static Path) -> Self { + pub fn new( + hot_tier_path: &'static Path, + sender: mpsc::UnboundedSender, + ) -> Self { std::fs::create_dir_all(hot_tier_path).unwrap(); HotTierManager { filesystem: LocalFileSystem::new(), hot_tier_path, state_cache: AsyncRwLock::new(HashMap::new()), tasks: AsyncRwLock::new(HashMap::new()), + sender, } } + #[tracing::instrument(name = "hottier.startup", skip(self))] + pub async fn start_all_tasks(&'static self) { + let _ = tokio::spawn(async move { + self.sender.send(HotTierMessage::StartAll).unwrap(); + }) + .instrument(tracing::Span::current()) + .await; + } + /// Lazy-load and cache the `StreamHotTier` for a (tenant, stream) pair. 
/// All sync-path mutations should acquire `state.sht.lock()`. async fn get_or_load_state( @@ -122,29 +259,36 @@ impl HotTierManager { tenant_id: &Option, ) -> Result, HotTierError> { let key: StreamKey = (tenant_id.clone(), stream.to_owned()); - if let Some(state) = self.state_cache.read().await.get(&key).cloned() { - return Ok(state); + { + if let Some(state) = self.state_cache.read().await.get(&key).cloned() { + return Ok(state); + } } // key not present, reconcile let sht = self.reconcile_stream(stream, tenant_id).await?; let state = Arc::new(StreamSyncState { sht: AsyncMutex::new(sht), + completed_dates: AsyncRwLock::new(HashSet::new()), }); - let mut cache = self.state_cache.write().await; - if cache.insert(key, state.clone()).is_some() { - tracing::warn!( - "Key- {:?} was absent during read lock but already exists after reconcile!", - (tenant_id, stream), - ); - }; + { + let mut cache = self.state_cache.write().await; + if cache.insert(key, state.clone()).is_some() { + tracing::warn!( + "Key- {:?} was absent during read lock but already exists after reconcile!", + (tenant_id, stream), + ); + }; + } Ok(state) } /// Drop cached state for a stream (used after delete). 
pub async fn invalidate_state(&self, stream: &str, tenant_id: &Option) { let key: StreamKey = (tenant_id.clone(), stream.to_owned()); - self.state_cache.write().await.remove(&key); + { + self.state_cache.write().await.remove(&key); + } } /// Walk the on-disk hot-tier directory for a stream and bring it into @@ -392,17 +536,6 @@ impl HotTierManager { Ok(()) } - /// Get a global - pub fn global() -> Option<&'static HotTierManager> { - static INSTANCE: OnceCell = OnceCell::new(); - - PARSEABLE - .options - .hot_tier_storage_path - .as_ref() - .map(|hot_tier_path| INSTANCE.get_or_init(|| HotTierManager::new(hot_tier_path))) - } - /// get the total hot tier size for all streams #[tracing::instrument( name = "hottier.get_hot_tiers_size", @@ -544,30 +677,22 @@ impl HotTierManager { err )] pub async fn delete_hot_tier( - &self, + &'static self, stream: &str, tenant_id: &Option, ) -> Result<(), HotTierError> { if !self.check_stream_hot_tier_exists(stream, tenant_id) { return Err(HotTierValidationError::NotFound(stream.to_owned()).into()); } - // Stop loops before tearing down the directory so no in-flight tick - // re-creates files mid-delete. 
- self.abort_stream_tasks(stream, tenant_id).await; - let path = if let Some(tenant_id) = tenant_id.as_ref() { - self.hot_tier_path.join(tenant_id).join(stream) - } else { - self.hot_tier_path.join(stream) - }; - fs::remove_dir_all(path).await.map_err(|e| { - error!( - stream = %stream, - tenant = ?tenant_id, - error = ?e - ); - e - })?; - self.invalidate_state(stream, tenant_id).await; + let stream_name = stream.to_owned(); + let tenant = tenant_id.to_owned(); + let _ = tokio::spawn(async move { + self.sender + .send(HotTierMessage::KillTask((tenant, stream_name))) + .unwrap(); + }) + .instrument(tracing::Span::current()) + .await; Ok(()) } @@ -615,86 +740,34 @@ impl HotTierManager { #[tracing::instrument(name = "hottier.abort", skip(self))] pub async fn abort_all(&self) { - let guard = self.tasks.write().await; - for (streamkey, task) in guard.iter() { - task.latest.abort(); - task.historic.abort(); - info!("aborted hot tier tasks for- {streamkey:?}"); - } - } - - /// Discover hot-tier-enabled streams at boot and spawn a per-stream pair - /// of (Latest, Historic) loops for each. New streams added later acquire - /// their own loops via `spawn_stream_tasks` from the PUT hot-tier handler. - #[tracing::instrument(name = "hottier.startup", skip(self), err)] - pub fn download_from_s3<'a>(&'a self) -> Result<(), HotTierError> - where - 'a: 'static, - { - let latest_min = PARSEABLE.options.hot_tier_latest_minutes; - let historic_min = PARSEABLE.options.hot_tier_historic_sync_minutes; - info!( - latest_minutes = latest_min, - historic_sync_minutes = historic_min, - "hot tier scheduler starting" - ); - - let this: &'static HotTierManager = self; - let startup_span = tracing::info_span!("hottier.startup.bootstrap"); - let span = startup_span.clone(); - tokio::spawn( - async move { - // pstats hot tier may need to be created on boot before any tasks - // can pick it up. 
- if let Err(e) = this.create_pstats_hot_tier().await { - tracing::error!("Skipping pstats hot tier creation because of error: {e}"); - } - let tenants = if let Some(tenants) = PARSEABLE.list_tenants() { - tenants.into_iter().map(Some).collect::>() - } else { - vec![None] - }; - for tenant_id in tenants { - for stream in PARSEABLE.streams.list(&tenant_id) { - if this.check_stream_hot_tier_exists(&stream, &tenant_id) { - let tenant_id = tenant_id.clone(); - - tokio::spawn(async move { - this.spawn_stream_tasks(stream, tenant_id).await; - }.instrument(span.clone())); - tokio::time::sleep(Duration::from_secs(8)).await; - } else { - // check for potential orphan directory on disk - let path = if let Some(tenant_id) = tenant_id.as_ref() { - self.hot_tier_path.join(tenant_id).join(stream) - } else { - self.hot_tier_path.join(stream) - }; - if path.exists() { - // delete this entire folder as stream meta says no hottier for stream - if let Err(e) = fs::remove_dir_all(&path).await { - tracing::error!( - "Unable to remove orphaned hottier dir- `{path:?}` with error- {e}" - ); - }; - } - } - } - } + { + let guard = self.tasks.write().await; + for (streamkey, task) in guard.iter() { + task.latest.abort(); + task.historic.abort(); + info!("aborted hot tier tasks for- {streamkey:?}"); } - .instrument(startup_span.clone()), - ); - Ok(()) + } } - /// Spawn (Latest, Historic) loops for a single stream. Idempotent: - /// if tasks already exist for this (tenant, stream), no-op. 
#[tracing::instrument( - name = "hottier.spawn_stream_tasks", + name = "hottier.spawn_stream_task", skip(self), fields(stream = %stream, tenant = ?tenant_id) )] - pub async fn spawn_stream_tasks(&'static self, stream: String, tenant_id: Option) { + pub async fn spawn_stream_task(&'static self, stream: String, tenant_id: Option) { + let _ = tokio::spawn(async move { + self.sender + .send(HotTierMessage::StartTask((tenant_id, stream))) + .unwrap(); + }) + .instrument(tracing::Span::current()) + .await; + } + + /// Spawn (Latest, Historic) loops for a single stream. Idempotent: + /// if tasks already exist for this (tenant, stream), no-op. + async fn spawn_stream_task_inner(&'static self, stream: String, tenant_id: Option) { let key: StreamKey = (tenant_id.clone(), stream.clone()); { let tasks = self.tasks.read().await; @@ -725,7 +798,6 @@ impl HotTierManager { anchor = %anchor ); async { - info!("stream tick fired"); if let Err(err) = self .process_stream(s.clone(), t.clone(), SyncPhase::Latest, anchor) .await @@ -752,7 +824,6 @@ impl HotTierManager { anchor = %anchor ); async { - info!("stream tick fired"); if let Err(err) = self .process_stream(s.clone(), t.clone(), SyncPhase::Historic, anchor) .await @@ -766,10 +837,12 @@ impl HotTierManager { } }); - let mut tasks = self.tasks.write().await; - if let Some(old) = tasks.insert(key, StreamTasks { latest, historic }) { - old.latest.abort(); - old.historic.abort(); + { + let mut tasks = self.tasks.write().await; + if let Some(old) = tasks.insert(key, StreamTasks { latest, historic }) { + old.latest.abort(); + old.historic.abort(); + } } } @@ -777,10 +850,12 @@ impl HotTierManager { /// will be enqueued for the stream after this returns. 
async fn abort_stream_tasks(&self, stream: &str, tenant_id: &Option) { let key: StreamKey = (tenant_id.clone(), stream.to_owned()); - if let Some(t) = self.tasks.write().await.remove(&key) { - t.latest.abort(); - t.historic.abort(); - info!(stream = %stream, tenant = ?tenant_id, "aborted per-stream hot tier tasks"); + { + if let Some(t) = self.tasks.write().await.remove(&key) { + t.latest.abort(); + t.historic.abort(); + info!(stream = %stream, tenant = ?tenant_id, "aborted per-stream hot tier tasks"); + } } } @@ -847,8 +922,12 @@ impl HotTierManager { phase: SyncPhase, anchor: DateTime, ) -> Result<(), HotTierError> { + // Load state up-front so the Historic candidate filter can consult + // `completed_dates` and so post-download completion marking has + // somewhere to write. Lazy-cached; steady-state cost is zero. + let state = self.get_or_load_state(stream, tenant_id).await?; let latest_minutes = PARSEABLE.options.hot_tier_latest_minutes; - let latest_window = chrono::Duration::minutes(latest_minutes.try_into().unwrap()); + let latest_window = chrono::Duration::minutes(latest_minutes as i64); let historic_cutoff = anchor - latest_window; let historic_cutoff_naive = historic_cutoff.naive_utc(); let today_date_key = format!("date={}", anchor.date_naive()); @@ -865,9 +944,9 @@ impl HotTierManager { while d <= end { out.push(format!("date={d}")); d = d.succ_opt().unwrap_or(d); - if out.len() > 365 { - break; - } + // if out.len() > 365 { + // break; + // } } out } @@ -879,34 +958,67 @@ impl HotTierManager { .into_iter() .map(|d| format!("date={d}")) .collect::>(); + info!( + stream = ?stream, + tenant_id=?tenant_id, + local_dates=?local + ); let s3 = PARSEABLE - .storage() - .get_object_store() + .hottier_connection_pool .list_dates(stream, tenant_id) .await .unwrap_or_default(); - + info!( + stream = ?stream, + tenant_id=?tenant_id, + s3_dates=?s3 + ); let mut union: std::collections::BTreeSet = local.into_iter().collect(); union.extend(s3); - let mut out = 
Vec::new(); - for date_key in union { - // drop today and anything >= today (Latest handles those) - if date_key.as_str() >= today_date_key.as_str() { - continue; - } - if self.is_locally_complete(stream, &date_key, tenant_id).await { - continue; + let mut out = { + let completed = state.completed_dates.read().await; + info!( + stream = ?stream, + tenant_id=?tenant_id, + completed_dates_len=?completed.len(), + "acquired completed_dates lock" + ); + let mut out = Vec::new(); + for date_key in union { + // drop today and anything >= today (Latest handles those) + if date_key.as_str() >= today_date_key.as_str() { + continue; + } + if completed.contains(&date_key) { + continue; + } + out.push(date_key); } - out.push(date_key); - } + + info!( + stream = ?stream, + tenant_id=?tenant_id, + "dropping completed_dates lock" + ); + out + }; + info!( + stream = ?stream, + tenant_id=?tenant_id, + "dropped completed_dates lock" + ); // Newest-first: discover newest missing past date first. out.sort(); out.reverse(); + info!( + stream = ?stream, + tenant_id=?tenant_id, + selected_dates=?out + ); out } }; - tracing::Span::current().record("candidate_dates", candidate_dates.len()); if candidate_dates.is_empty() { info!( @@ -934,7 +1046,7 @@ impl HotTierManager { e.to_detail(), ))) })?; - + warn!("fetched manifests {stream}"); // Build flat work list across all candidate dates: keep only files // matching this phase's cutoff and not already on disk. let mut work: Vec<(NaiveDate, chrono::NaiveDateTime, File, PathBuf)> = Vec::new(); @@ -972,12 +1084,36 @@ impl HotTierManager { // Newest first by file timestamp. work.sort_by_key(|b| std::cmp::Reverse(b.1)); + warn!("made work list {stream}"); + // Historic ticks cap per-tick work so a fresh stream with months of + // backlog doesn't pin the loop for tens of minutes. Remainder flows + // through subsequent Historic ticks. Latest stays uncapped — its + // window is already time-bounded. 
+ let per_tick_cap = PARSEABLE.options.historic_per_tick_cap as usize; + let truncated = if matches!(phase, SyncPhase::Historic) && work.len() > per_tick_cap { + let dropped = work.len() - per_tick_cap; + work.truncate(per_tick_cap); + warn!("truncated work list {stream} {dropped}"); + dropped + } else { + 0 + }; let work_count = work.len(); let total_bytes: u64 = work.iter().map(|(_, _, f, _)| f.file_size).sum(); tracing::Span::current() .record("work_count", work_count) .record("total_bytes", total_bytes); + if truncated > 0 { + info!( + stream = %stream, + tenant = ?tenant_id, + phase = ?phase, + cap = per_tick_cap, + deferred = truncated, + "historic per-tick cap hit; deferring rest to next tick" + ); + } if work.is_empty() { info!( stream = %stream, @@ -996,7 +1132,6 @@ impl HotTierManager { "work list built" ); - let state = self.get_or_load_state(stream, tenant_id).await?; let concurrency = PARSEABLE.options.hot_tier_files_per_stream_concurrency; // Reservation failure (out of disk + nothing to evict) is sticky: @@ -1006,80 +1141,151 @@ impl HotTierManager { let stream_owned = stream.to_owned(); let tenant_owned = tenant_id.clone(); - let results: Vec> = futures::stream::iter(work) - .map(|(date, _dt, file, parquet_path)| { - let state = state.clone(); - let stream = stream_owned.clone(); - let tenant_id = tenant_owned.clone(); - let stop = stop.clone(); - async move { - if stop.load(std::sync::atomic::Ordering::Relaxed) { - return Ok(()); + // take lock over sht and reserve + // for all files in worklist + warn!("reserving for {stream}"); + let reserved = { + let mut sht = state.sht.lock().await; + self.reserve_disk_budget(stream, &work, tenant_id, phase, &mut sht) + .await? 
+ }; + warn!("reserved {reserved} for {stream}"); + let results: Vec> = if reserved { + futures::stream::iter(work) + .map(|(date, _dt, file, parquet_path)| { + let state = state.clone(); + let stream = stream_owned.clone(); + let tenant_id = tenant_owned.clone(); + let stop = stop.clone(); + async move { + warn!("processing {parquet_path:?} for {stream}"); + if stop.load(std::sync::atomic::Ordering::Relaxed) { + return Ok(()); + } + let processed = self + .process_parquet_file_concurrent( + &stream, + &file, + parquet_path, + date, + &tenant_id, + &state, + phase, + ) + .await?; + warn!("processed for {stream}\n"); + if !processed && !stop.swap(true, std::sync::atomic::Ordering::Relaxed) { + info!( + stream = %stream, + tenant = ?tenant_id, + phase = ?phase, + "sticky stop: halting further reservations this tick" + ); + warn!("stopping reservations for {stream}"); + } + Ok(()) } - let processed = self - .process_parquet_file_concurrent( - &stream, - &file, - parquet_path, - date, - &tenant_id, - &state, - phase, - ) - .await?; - if !processed && !stop.swap(true, std::sync::atomic::Ordering::Relaxed) { + }) + .buffered(concurrency as usize) + .collect() + .await + } else { + vec![] + }; + + for r in results { + r?; + } + + // Mark Historic candidate dates as complete when local count now + // matches the S3 manifest count. Skip when truncation deferred work + // — by definition more files exist than we downloaded. 
+ if matches!(phase, SyncPhase::Historic) && truncated == 0 { + let mut newly_complete = Vec::new(); + for date_key in &candidate_dates { + let s3_count: usize = s3_manifests + .get(date_key) + .map(|v| v.iter().map(|m| m.files.len()).sum()) + .unwrap_or(0); + if s3_count == 0 { + continue; + } + let local_count = self + .local_manifest_file_count(stream, date_key, tenant_id) + .await; + if local_count >= s3_count { + newly_complete.push(date_key.clone()); + } + } + if !newly_complete.is_empty() { + { + let mut completed = state.completed_dates.write().await; + for d in newly_complete { info!( stream = %stream, tenant = ?tenant_id, - phase = ?phase, - "sticky stop: halting further reservations this tick" + date = %d, + "marking date locally complete" ); + completed.insert(d); } - Ok(()) } - }) - .buffer_unordered(concurrency) - .collect() - .await; - - for r in results { - r?; + } } Ok(()) } - /// Reserve disk budget under the per-stream lock, download outside the lock, - /// then commit usage + per-date manifest under the lock again. - /// Returns false when no budget is available (caller should stop scheduling - /// further work for this stream). + /// Count files recorded in a date's local `hottier.manifest.json`. + /// Returns 0 if the manifest is missing or fails to parse. 
+ async fn local_manifest_file_count( + &self, + stream: &str, + date_key: &str, + tenant_id: &Option, + ) -> usize { + let date_dir = if let Some(tenant) = tenant_id.as_ref() { + self.hot_tier_path.join(tenant).join(stream).join(date_key) + } else { + self.hot_tier_path.join(stream).join(date_key) + }; + let manifest_path = date_dir.join("hottier.manifest.json"); + if !manifest_path.exists() { + return 0; + } + match fs::read(&manifest_path).await { + Ok(bytes) => serde_json::from_slice::(&bytes) + .map(|m| m.files.len()) + .unwrap_or(0), + Err(_) => 0, + } + } #[allow(clippy::too_many_arguments)] #[tracing::instrument( - name = "hottier.process_parquet_file", - skip(self, parquet_file, parquet_path, state), + name = "hottier.reserve_disk_budget", + skip(self, work, sht), fields( stream = %stream, tenant = ?tenant_id, phase = ?phase, - date = %date, - file = %parquet_file.file_path, - file_size = parquet_file.file_size ), err )] - async fn process_parquet_file_concurrent( + async fn reserve_disk_budget( &self, stream: &str, - parquet_file: &File, - parquet_path: PathBuf, - date: NaiveDate, + // parquet_file: &File, + // parquet_path: PathBuf, + // date: NaiveDate, + work: &Vec<(NaiveDate, chrono::NaiveDateTime, File, PathBuf)>, tenant_id: &Option, - state: &Arc, phase: SyncPhase, + sht: &mut StreamHotTier, ) -> Result { // RESERVE - { - let mut sht = state.sht.lock().await; + + for (_, _, parquet_file, parquet_path) in work { + // let mut sht = state.sht.lock().await; info!( stream = %stream, tenant = ?tenant_id, @@ -1106,8 +1312,8 @@ impl HotTierManager { if !self .cleanup_hot_tier_old_data( stream, - &mut sht, - &parquet_path, + sht, + parquet_path, parquet_file.file_size, tenant_id, ) @@ -1161,7 +1367,6 @@ impl HotTierManager { ); 0 }; - self.put_hot_tier(stream, &mut sht, tenant_id).await?; info!( stream = %stream, tenant = ?tenant_id, @@ -1171,7 +1376,38 @@ impl HotTierManager { "reserved" ); } + self.put_hot_tier(stream, sht, tenant_id).await?; + Ok(true) + 
} + /// Reserve disk budget under the per-stream lock, download outside the lock, + /// then commit usage + per-date manifest under the lock again. + /// Returns false when no budget is available (caller should stop scheduling + /// further work for this stream). + #[allow(clippy::too_many_arguments)] + #[tracing::instrument( + name = "hottier.process_parquet_file", + skip(self, parquet_file, parquet_path, state), + fields( + stream = %stream, + tenant = ?tenant_id, + phase = ?phase, + date = %date, + file = %parquet_file.file_path, + file_size = parquet_file.file_size + ), + err + )] + async fn process_parquet_file_concurrent( + &self, + stream: &str, + parquet_file: &File, + parquet_path: PathBuf, + date: NaiveDate, + tenant_id: &Option, + state: &Arc, + phase: SyncPhase, + ) -> Result { // DOWNLOAD (no lock held) let parquet_file_path = RelativePathBuf::from(parquet_file.file_path.clone()); fs::create_dir_all(parquet_path.parent().unwrap()).await?; @@ -1184,8 +1420,7 @@ impl HotTierManager { ); let dl_start = std::time::Instant::now(); let download_result = PARSEABLE - .storage - .get_object_store() + .hottier_connection_pool .parallel_chunked_download(&parquet_file_path, tenant_id, parquet_path.clone()) .await; let dl_elapsed = dl_start.elapsed(); @@ -1345,34 +1580,6 @@ impl HotTierManager { } } - /// A past date is treated as fully synced when its on-disk hottier - /// manifest exists and lists at least one parquet. Used by the Historic - /// phase to skip the metastore fetch for dates we already have data for. - /// `date_key` is the partition directory name (e.g. "date=2026-05-12"). 
- async fn is_locally_complete( - &self, - stream: &str, - date_key: &str, - tenant_id: &Option, - ) -> bool { - let date_dir = if let Some(tenant) = tenant_id.as_ref() { - self.hot_tier_path.join(tenant).join(stream).join(date_key) - } else { - self.hot_tier_path.join(stream).join(date_key) - }; - let manifest_path = date_dir.join("hottier.manifest.json"); - if !manifest_path.exists() { - return false; - } - match fs::read(&manifest_path).await { - Ok(bytes) => match serde_json::from_slice::(&bytes) { - Ok(m) => !m.files.is_empty(), - Err(_) => false, - }, - Err(_) => false, - } - } - /// Returns the list of manifest files present in hot tier directory for the stream pub async fn get_hot_tier_manifest_files( &self, diff --git a/src/metastore/metastores/object_store_metastore.rs b/src/metastore/metastores/object_store_metastore.rs index 5adf93cef..d22f87226 100644 --- a/src/metastore/metastores/object_store_metastore.rs +++ b/src/metastore/metastores/object_store_metastore.rs @@ -28,7 +28,7 @@ use chrono::{DateTime, Utc}; use dashmap::DashMap; use relative_path::RelativePathBuf; use tonic::async_trait; -use tracing::warn; +use tracing::{info, warn}; use ulid::Ulid; use crate::{ @@ -928,44 +928,103 @@ impl Metastore for ObjectStoreMetastore { .await?) } - /// Fetch all `Manifest` files + /// Fetch manifest files for the given date keys in parallel. + /// + /// Per-date work runs concurrently via `futures::future::try_join_all` + /// so a slow LIST on one date does not block the others. Each date's + /// timing is logged at info level so any pathologically slow dates + /// (S3 retries, oversized prefixes) are visible without needing extra + /// span instrumentation. 
async fn get_manifest_files_for_dates( &self, stream_name: &str, tenant_id: &Option, dates: &[String], ) -> Result>, MetastoreError> { - let mut result_file_list: BTreeMap> = BTreeMap::new(); - for date in dates { - let date_path = if let Some(tenant) = tenant_id { - object_store::path::Path::from(format!("{}/{}/{}", tenant, stream_name, date)) - } else { - object_store::path::Path::from(format!("{}/{}", stream_name, date)) - }; - let resp = match self.storage.list_with_delimiter(Some(date_path)).await { - Ok(r) => r, - Err(ObjectStorageError::NoSuchKey(_)) => continue, - Err(e) => return Err(e.into()), - }; + let total_start = std::time::Instant::now(); + let date_futures = dates.iter().map(|date| { + // let storage = self.storage.clone(); + let stream = stream_name.to_string(); + let tenant = tenant_id.clone(); + let date = date.clone(); + async move { + let t_start = std::time::Instant::now(); + let date_path = if let Some(tenant) = tenant.as_ref() { + object_store::path::Path::from(format!("{}/{}/{}", tenant, &stream, &date)) + } else { + object_store::path::Path::from(format!("{}/{}", &stream, &date)) + }; - let manifest_paths: Vec = resp - .objects - .iter() - .filter(|name| name.location.filename().unwrap().ends_with("manifest.json")) - .map(|name| name.location.to_string()) - .collect(); + let t_list = std::time::Instant::now(); + let resp = match self.storage.list_with_delimiter(Some(date_path)).await { + Ok(r) => r, + Err(ObjectStorageError::NoSuchKey(_)) => { + info!( + stream = %stream, + tenant = ?tenant, + date = %date, + list_ms = t_list.elapsed().as_millis() as u64, + "manifest fetch: date prefix not found" + ); + return Ok::<(String, Vec), MetastoreError>((date, Vec::new())); + } + Err(e) => return Err(e.into()), + }; + let list_ms = t_list.elapsed().as_millis() as u64; - for path in manifest_paths { - let bytes = self - .storage - .get_object(&RelativePathBuf::from(path), tenant_id) - .await?; - result_file_list - .entry(date.clone()) - 
.or_default() - .push(serde_json::from_slice::(&bytes)?); + let manifest_paths: Vec = resp + .objects + .iter() + .filter(|name| name.location.filename().unwrap().ends_with("manifest.json")) + .map(|name| name.location.to_string()) + .collect(); + let manifest_count = manifest_paths.len(); + + let t_gets = std::time::Instant::now(); + let mut manifests: Vec = Vec::with_capacity(manifest_count); + for path in manifest_paths { + let bytes = self + .storage + .get_object(&RelativePathBuf::from(path), &tenant) + .await?; + manifests.push(serde_json::from_slice::(&bytes)?); + } + let gets_ms = t_gets.elapsed().as_millis() as u64; + let total_ms = t_start.elapsed().as_millis() as u64; + + if total_ms > 100 { + info!( + stream = %stream, + tenant = ?tenant, + date = %date, + list_ms, + gets_ms, + total_ms, + manifest_count, + "manifest fetch per-date timing" + ); + } + Ok((date, manifests)) + } + }); + + let results: Vec<(String, Vec)> = + futures::future::try_join_all(date_futures).await?; + + let mut result_file_list: BTreeMap> = BTreeMap::new(); + for (date, manifests) in results { + if !manifests.is_empty() { + result_file_list.insert(date, manifests); } } + info!( + stream = %stream_name, + tenant = ?tenant_id, + dates = dates.len(), + populated = result_file_list.len(), + total_ms = total_start.elapsed().as_millis() as u64, + "get_manifest_files_for_dates done" + ); Ok(result_file_list) } diff --git a/src/parseable/mod.rs b/src/parseable/mod.rs index 8d553fbc9..b2eef36ea 100644 --- a/src/parseable/mod.rs +++ b/src/parseable/mod.rs @@ -75,8 +75,8 @@ use crate::{ }, static_schema::{StaticSchema, convert_static_schema_to_arrow_schema}, storage::{ - ObjectStorageError, ObjectStorageProvider, ObjectStoreFormat, Owner, Permisssion, - StorageMetadata, StreamType, put_remote_metadata, + ObjectStorage, ObjectStorageError, ObjectStorageProvider, ObjectStoreFormat, Owner, + Permisssion, StorageMetadata, StreamType, put_remote_metadata, }, tenants::{Service, TENANT_METADATA}, 
validator, @@ -173,13 +173,14 @@ pub static PARSEABLE: Lazy = Lazy::new(|| { let metastore = ObjectStoreMetastore { storage: args.storage.construct_client(), }; - + let hottier_connection_pool = args.storage.construct_client(); Parseable::new( args.options, #[cfg(feature = "kafka")] args.kafka, Arc::new(args.storage), Arc::new(metastore), + hottier_connection_pool, ) } StorageOptions::S3(args) => { @@ -187,12 +188,14 @@ pub static PARSEABLE: Lazy = Lazy::new(|| { let metastore = ObjectStoreMetastore { storage: args.storage.construct_client(), }; + let hottier_connection_pool = args.storage.construct_client(); Parseable::new( args.options, #[cfg(feature = "kafka")] args.kafka, Arc::new(args.storage), Arc::new(metastore), + hottier_connection_pool, ) } StorageOptions::Blob(args) => { @@ -200,12 +203,14 @@ pub static PARSEABLE: Lazy = Lazy::new(|| { let metastore = ObjectStoreMetastore { storage: args.storage.construct_client(), }; + let hottier_connection_pool = args.storage.construct_client(); Parseable::new( args.options, #[cfg(feature = "kafka")] args.kafka, Arc::new(args.storage), Arc::new(metastore), + hottier_connection_pool, ) } StorageOptions::Gcs(args) => { @@ -213,12 +218,14 @@ pub static PARSEABLE: Lazy = Lazy::new(|| { let metastore = ObjectStoreMetastore { storage: args.storage.construct_client(), }; + let hottier_connection_pool = args.storage.construct_client(); Parseable::new( args.options, #[cfg(feature = "kafka")] args.kafka, Arc::new(args.storage), Arc::new(metastore), + hottier_connection_pool, ) } } @@ -238,6 +245,8 @@ pub struct Parseable { pub tenants: Arc>>, /// metastore pub metastore: Arc, + /// Hot-tier connection pool + pub hottier_connection_pool: Arc, /// Used to configure the kafka connector #[cfg(feature = "kafka")] pub kafka_config: KafkaConfig, @@ -249,11 +258,13 @@ impl Parseable { #[cfg(feature = "kafka")] kafka_config: KafkaConfig, storage: Arc, metastore: Arc, + hottier_connection_pool: Arc, ) -> Self { Parseable { options: 
Arc::new(options), storage, metastore, + hottier_connection_pool, streams: Streams::default(), tenants: Arc::new(RwLock::new(vec![])), #[cfg(feature = "kafka")] diff --git a/src/query/stream_schema_provider.rs b/src/query/stream_schema_provider.rs index 6d4005972..97afa1cfb 100644 --- a/src/query/stream_schema_provider.rs +++ b/src/query/stream_schema_provider.rs @@ -55,7 +55,7 @@ use crate::{ snapshot::{ManifestItem, Snapshot}, }, event::DEFAULT_TIMESTAMP_KEY, - hottier::HotTierManager, + hottier::{GLOBAL_HOTTIER, HotTierManager}, metrics::{QUERY_CACHE_HIT, increment_files_scanned_in_query_by_date}, option::Mode, parseable::{DEFAULT_TENANT, PARSEABLE, STREAM_EXISTS}, @@ -613,7 +613,7 @@ impl TableProvider for StandardTableProvider { } // Hot tier data fetch - if let Some(hot_tier_manager) = HotTierManager::global() + if let Some(hot_tier_manager) = GLOBAL_HOTTIER.get() && hot_tier_manager.check_stream_hot_tier_exists(&self.stream, &self.tenant_id) { self.get_hottier_exectuion_plan( diff --git a/src/storage/azure_blob.rs b/src/storage/azure_blob.rs index 1661d16fc..18f20191a 100644 --- a/src/storage/azure_blob.rs +++ b/src/storage/azure_blob.rs @@ -275,12 +275,10 @@ impl BlobStore { tracing::Span::current() .record("total_bytes", total) .record("chunks", chunk_count); - let client = self.client.clone(); let semaphore = Arc::new(tokio::sync::Semaphore::new(concurrency as usize)); futures::stream::iter(ranges) .map(|r| { - let client = client.clone(); let src = src.clone(); let std_file = std_file.clone(); let semaphore = semaphore.clone(); @@ -288,7 +286,7 @@ impl BlobStore { let _permit = semaphore.acquire_owned().await.map_err(|e| { ObjectStorageError::Custom(format!("semaphore closed: {e}")) })?; - let bytes = client.get_range(&src, r.clone()).await?; + let bytes = self.client.get_range(&src, r.clone()).await?; let offset = r.start; tokio::task::spawn_blocking(move || -> std::io::Result<()> { crate::storage::write_all_at(&std_file, &bytes, offset) diff --git 
a/src/storage/gcs.rs b/src/storage/gcs.rs index 56bcc71a9..01c98848f 100644 --- a/src/storage/gcs.rs +++ b/src/storage/gcs.rs @@ -242,12 +242,10 @@ impl Gcs { tracing::Span::current() .record("total_bytes", total) .record("chunks", chunk_count); - let client = self.client.clone(); let semaphore = Arc::new(tokio::sync::Semaphore::new(concurrency as usize)); futures::stream::iter(ranges) .map(|r| { - let client = client.clone(); let src = src.clone(); let std_file = std_file.clone(); let semaphore = semaphore.clone(); @@ -255,7 +253,7 @@ impl Gcs { let _permit = semaphore.acquire_owned().await.map_err(|e| { ObjectStorageError::Custom(format!("semaphore closed: {e}")) })?; - let bytes = client.get_range(&src, r.clone()).await?; + let bytes = self.client.get_range(&src, r.clone()).await?; let offset = r.start; tokio::task::spawn_blocking(move || -> std::io::Result<()> { crate::storage::write_all_at(&std_file, &bytes, offset) @@ -619,8 +617,7 @@ impl ObjectStorage for Gcs { } }; - let store: Arc = self.client.clone(); - let buf = object_store::buffered::BufReader::new(store, &meta); + let buf = object_store::buffered::BufReader::new(self.client.clone(), &meta); Ok(buf) } diff --git a/src/storage/mod.rs b/src/storage/mod.rs index df120cacb..e40100195 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -121,7 +121,7 @@ pub const CURRENT_OBJECT_STORE_VERSION: &str = "v7"; pub const CURRENT_SCHEMA_VERSION: &str = "v7"; const CONNECT_TIMEOUT_SECS: u64 = 5; -const REQUEST_TIMEOUT_SECS: u64 = 300; +const REQUEST_TIMEOUT_SECS: u64 = 30; #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] pub struct ObjectStoreFormat { diff --git a/src/storage/s3.rs b/src/storage/s3.rs index dd3013952..b66bd5a6e 100644 --- a/src/storage/s3.rs +++ b/src/storage/s3.rs @@ -236,6 +236,8 @@ impl S3Config { let mut client_options = ClientOptions::default() .with_allow_http(true) .with_connect_timeout(Duration::from_secs(CONNECT_TIMEOUT_SECS)) + 
.with_pool_idle_timeout(Duration::from_secs(15)) + .with_pool_max_idle_per_host(128) .with_timeout(Duration::from_secs(REQUEST_TIMEOUT_SECS)); if self.skip_tls { @@ -243,7 +245,7 @@ impl S3Config { } let retry_config = RetryConfig { max_retries: 5, - retry_timeout: Duration::from_secs(30), + retry_timeout: Duration::from_secs(5), backoff: BackoffConfig::default(), }; @@ -403,12 +405,11 @@ impl S3 { tracing::Span::current() .record("total_bytes", total) .record("chunks", chunk_count); - let client = Arc::new(self.client.clone()); + let semaphore = Arc::new(tokio::sync::Semaphore::new(concurrency as usize)); futures::stream::iter(ranges) .map(|r| { - let client = client.clone(); let src = src.clone(); let std_file = std_file.clone(); let semaphore = semaphore.clone(); @@ -416,7 +417,7 @@ impl S3 { let _permit = semaphore.acquire_owned().await.map_err(|e| { ObjectStorageError::Custom(format!("semaphore closed: {e}")) })?; - let bytes = client.get_range(&src, r.clone()).await?; + let bytes = self.client.get_range(&src, r.clone()).await?; let offset = r.start; tokio::task::spawn_blocking(move || -> std::io::Result<()> { crate::storage::write_all_at(&std_file, &bytes, offset) @@ -805,8 +806,7 @@ impl ObjectStorage for S3 { } }; - let store: Arc = Arc::new(self.client.clone()); - let buf = object_store::buffered::BufReader::new(store, &meta); + let buf = object_store::buffered::BufReader::new(self.client.clone(), &meta); Ok(buf) } @@ -1017,12 +1017,6 @@ impl ObjectStorage for S3 { Ok(result?) 
} - #[tracing::instrument( - name = "s3.check", - skip(self), - fields(tenant = ?tenant_id), - err - )] #[tracing::instrument( name = "s3.check", skip(self), diff --git a/src/telemetry.rs b/src/telemetry.rs index 1a2212bda..77943572e 100644 --- a/src/telemetry.rs +++ b/src/telemetry.rs @@ -16,6 +16,7 @@ * */ +use datafusion::common::HashSet; use opentelemetry_otlp::Protocol; use opentelemetry_otlp::SpanExporter; use opentelemetry_otlp::WithExportConfig; @@ -24,11 +25,43 @@ use opentelemetry_sdk::{ propagation::TraceContextPropagator, trace::{BatchSpanProcessor, SdkTracerProvider}, }; +use tracing::Metadata; +use tracing_subscriber::layer::Context; +use tracing_subscriber::layer::Filter; + +use crate::parseable::PARSEABLE; // Consts describing the env vars const OTEL_EXPORTER_OTLP_ENDPOINT: &str = "OTEL_EXPORTER_OTLP_ENDPOINT"; const OTEL_EXPORTER_OTLP_TRACES_ENDPOINT: &str = "OTEL_EXPORTER_OTLP_TRACES_ENDPOINT"; const OTEL_EXPORTER_OTLP_PROTOCOL: &str = "OTEL_EXPORTER_OTLP_PROTOCOL"; +pub struct ModuleFilter { + trace_modules: HashSet, +} + +impl ModuleFilter { + fn new() -> Option { + if let Some(modules) = PARSEABLE.options.trace_modules.as_ref() { + let trace_modules = modules.split(",").map(|v| v.to_owned()).collect(); + Some(Self { trace_modules }) + } else { + None + } + } +} + +impl Filter for ModuleFilter { + fn enabled(&self, meta: &Metadata<'_>, _cx: &Context<'_, S>) -> bool { + if let Some(path) = meta.module_path() + && self.trace_modules.contains(path) + { + true + } else { + false + } + } +} + /// Initialise an OTLP tracer provider. 
/// /// **Required env var:** @@ -126,3 +159,7 @@ pub fn init_tracing(service: &'static str) -> Option { Some(provider) } + +pub fn module_filter() -> Option { + ModuleFilter::new() +} diff --git a/src/utils/mod.rs b/src/utils/mod.rs index 237094444..a561215ac 100644 --- a/src/utils/mod.rs +++ b/src/utils/mod.rs @@ -40,7 +40,16 @@ use actix_web::http::header::HeaderMap; use actix_web::{FromRequest, HttpRequest}; use actix_web_httpauth::extractors::basic::BasicAuth; use chrono::{NaiveDate, NaiveDateTime, NaiveTime, Utc}; +use once_cell::sync::Lazy; use regex::Regex; + +/// Compiled once at startup; reused for every `extract_datetime` call. +/// Recompiling per call was the hot-path bottleneck for hot-tier work-list +/// construction (~50μs per call * 100k+ files per tick). +static PARTITION_DATETIME_RE: Lazy = Lazy::new(|| { + Regex::new(r"date=(\d{4}-\d{2}-\d{2})/hour=(\d{2})/minute=(\d{2})") + .expect("partition datetime regex is valid") +}); use sha2::{Digest, Sha256}; pub fn get_node_id() -> String { @@ -49,8 +58,7 @@ pub fn get_node_id() -> String { } pub fn extract_datetime(path: &str) -> Option { - let re = Regex::new(r"date=(\d{4}-\d{2}-\d{2})/hour=(\d{2})/minute=(\d{2})").unwrap(); - if let Some(caps) = re.captures(path) { + if let Some(caps) = PARTITION_DATETIME_RE.captures(path) { let date_str = caps.get(1)?.as_str(); let hour_str = caps.get(2)?.as_str(); let minute_str = caps.get(3)?.as_str();