From c68e71974ebd3642ab0ee416431877225c24aad5 Mon Sep 17 00:00:00 2001 From: Baptiste Fernandez Date: Fri, 3 Apr 2026 11:16:26 +0100 Subject: [PATCH] improve data-analysis skill structure + documentation --- examples/agents/skills/data-analysis/SKILL.md | 133 +++++++++++++++--- 1 file changed, 110 insertions(+), 23 deletions(-) diff --git a/examples/agents/skills/data-analysis/SKILL.md b/examples/agents/skills/data-analysis/SKILL.md index 0352cfdc..f7f1f020 100644 --- a/examples/agents/skills/data-analysis/SKILL.md +++ b/examples/agents/skills/data-analysis/SKILL.md @@ -1,29 +1,116 @@ --- name: data-analysis -description: Analyze datasets, generate charts, and create summary reports. Use when the user needs to work with CSV, Excel, or other tabular data formats for analysis or visualization. +description: > + Analyze datasets, generate charts, and create summary reports from CSV, Excel, + JSON, Parquet, or other tabular data. Capabilities: statistical profiling, + outlier detection, pivot tables, groupby aggregation, time-series analysis, + correlation matrices, and publication-ready visualizations. + Use when the user asks to analyze, visualize, profile, or summarize tabular + data, or mentions CSV, Excel, charts, statistics, EDA, histograms, scatter + plots, dashboards, or exploratory analysis. --- # Data Analysis -## When to use this skill -Use this skill when the user needs to: -- Analyze CSV or Excel files -- Generate charts and visualizations -- Calculate statistics and summaries -- Clean and transform data - -## How to analyze data -1. Use pandas for data analysis: - ```python - import pandas as pd - df = pd.read_csv('data.csv') - summary = df.describe() - ``` - -## How to create visualizations -1. Use matplotlib or seaborn for charts: - ```python - import matplotlib.pyplot as plt - df.plot(kind='bar') - plt.savefig('chart.png') - ``` +## Workflow + +Follow these steps in order. Do not skip validation checkpoints. 
+ +### Step 1: Load and validate + +Select the reader based on file format: + +| Format | Reader | +|--------|--------| +| CSV | `pd.read_csv(path, parse_dates=True)` | +| Excel | `pd.read_excel(path, sheet_name=0)` | +| JSON | `pd.read_json(path)` | +| Parquet | `pd.read_parquet(path)` | + +For CSV files, wrap the read in a try/except to handle encoding issues (fall back to `encoding='latin-1'`, which can decode any byte sequence). Note that `parse_dates=True` only attempts to parse the index; to parse date columns, name them explicitly, e.g. `parse_dates=['date']`. + +```python +import pandas as pd + +try: + df = pd.read_csv('data.csv', parse_dates=True) +except UnicodeDecodeError: + df = pd.read_csv('data.csv', encoding='latin-1', parse_dates=True) + +assert not df.empty, "Dataset is empty — verify the file path and format." +print(f"Shape: {df.shape}\nColumns: {list(df.columns)}\nDtypes:\n{df.dtypes}") +``` + +### Step 2: Profile and clean + +```python +print(df.describe(include='all')) + +missing = df.isnull().sum() +print(f"Missing values:\n{missing[missing > 0]}") + +for col in df.select_dtypes('object'): + converted = pd.to_numeric(df[col], errors='coerce') + if converted.notna().sum() > len(df) * 0.5: + coerced = df[col][converted.isna() & df[col].notna()] + print(f"Coerced {len(coerced)} non-numeric values in '{col}'") + df[col] = converted + +for col in df.select_dtypes('number'): + q1, q3 = df[col].quantile([0.25, 0.75]) + iqr = q3 - q1 + outliers = ((df[col] < q1 - 1.5 * iqr) | (df[col] > q3 + 1.5 * iqr)).sum() + if outliers > 0: + print(f"Column '{col}': {outliers} outliers detected") +``` + +### Step 3: Transform and aggregate + +```python +summary = df.groupby('category')['value'].agg(['mean', 'median', 'std', 'count']) +print(summary) + +pivot = df.pivot_table(values='revenue', index='region', columns='quarter', aggfunc='sum') + +ts = df.set_index('date')['value'].resample('M').mean() # time-series resampling +``` + +### Step 4: Visualize and save + +```python +import matplotlib +matplotlib.use('Agg') +import matplotlib.pyplot as plt +import seaborn as sns + +fig, axes = plt.subplots(2, 2, figsize=(14, 10)) 
+df['value'].hist(ax=axes[0, 0], bins=30, edgecolor='black') +axes[0, 0].set_title('Distribution of Value') +df.groupby('region')['revenue'].sum().plot.bar(ax=axes[0, 1], title='Revenue by Region') +sns.heatmap(df.select_dtypes('number').corr(), annot=True, fmt='.2f', ax=axes[1, 0]) +axes[1, 0].set_title('Correlation Matrix') +df.groupby('date')['value'].mean().plot(ax=axes[1, 1], title='Trend Over Time') +plt.tight_layout() +plt.savefig('eda_report.png', dpi=150) +print("Chart saved to eda_report.png") +``` + +If chart rendering fails, fall back to a text summary table. + +### Step 5: Report findings + +Print a plain-language summary covering: +- Dataset shape and completeness (rows, columns, missing %) +- Key statistics (means, medians, notable distributions) +- Outliers or data quality issues found +- Patterns observed (correlations, group differences, trends) + +## Error recovery + +| Problem | Action | +|---------|--------| +| File not found | List directory contents with `os.listdir()` and ask the user to confirm the filename | +| Encoding error | Retry with `encoding='cp1252'`, then `'latin-1'` (which decodes any byte sequence, so it must come last) | +| Mixed dtypes in column | Use `pd.to_numeric(df[col], errors='coerce')` and report the coerced rows | +| Empty dataframe after filter | Warn the user and show `value_counts()` for the filter column | +| Large dataset (>1M rows) | Use `df.sample(n=10000)` for profiling and the full data for aggregation |