Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 8 additions & 4 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,17 @@
.ipynb_checkpoints/
*/*.egg-info/*
.idea/

build/
logs/
__pycache__/
docs/
cache/
data/
datalake/
plugins/

minio/
neo4j/
neo4j_data/
postgress_storage/
storage/
chromadb/
postgres/

15 changes: 15 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
FROM apache/airflow:2.10.0

# Set the working directory
WORKDIR /app

# Switch to airflow user to run the application
USER airflow

# Copy the requirements file and install dependencies
COPY requirements.txt .

RUN pip install --no-cache-dir -r requirements.txt

# Set the entrypoint to Airflow
ENTRYPOINT ["airflow"]
115 changes: 106 additions & 9 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,21 +1,118 @@
.PHONY: server etl test
.PHONY: setup install start stop profile server etl test build clean

# Default port for the server
PORT ?= 8000
.PHONY: install
install:
@echo "Installing requirements..."
@pip install -r requirements.txt

# Path for ETL documents
ETL_PATH ?= /path/to/docspecs
FORCE ?= true
.PHONY: setup
setup: install
@if [ ! -f .env ]; then \
echo "Generating .env file with FERNET_KEY..."; \
echo "Generating .env file with FERNET_KEY..."; \
python3 -c "from cryptography.fernet import Fernet; \
fernet_key = Fernet.generate_key().decode(); \
template = 'FERNET_KEY={fernet_key}\\nAIRFLOW_UID=50000'; \
print(template.format(fernet_key=fernet_key)); \
print('_AIRFLOW_WWW_USER_USERNAME=airflow'); \
print('_AIRFLOW_WWW_USER_PASSWORD=airflow'); " > .env; \
echo ".env file generated."; \
else \
echo ".env file already exists. Skipping FERNET_KEY generation."; \
fi
@if [ ! -f config/local/.env ]; then \
echo "Copying .env file to config/local..."; \
cp .env config/local/; \
echo ".env file copied."; \
else \
echo "config/local/.env file already exists. Skipping config/local copy."; \
fi
@echo "Launching minio..."
@docker-compose up -d minio
@echo "Waiting for minio to start..."
@until docker-compose exec minio mc ready local; do \
echo "Minio is not healthy yet. Retrying in 5 seconds..."; \
sleep 5; \
done
@echo "Setting up local alias..."
@docker-compose exec minio mc alias set minio http://localhost:9000 minioadmin minioadmin
@echo "Checking if bucket 'legal' exists..."
@if ! docker-compose exec minio mc ls minio/legal; then \
echo "Creating bucket 'legal'..."; \
docker-compose exec minio mc mb minio/legal; \
else \
echo "Bucket 'legal' already exists. Skipping creation."; \
fi
@echo "Checking if bucket 'airflow-logs' exists..."
@if ! docker-compose exec minio mc ls minio/airflow-logs; then \
echo "Creating bucket 'airflow-logs'..."; \
docker-compose exec minio mc mb minio/airflow-logs; \
else \
echo "Bucket 'airflow-logs' already exists. Skipping creation."; \
fi
@echo "Minio setup complete. Stopping minio..."
@docker-compose stop minio
@echo "Initializing Airflow..."
@docker-compose up -d airflow-webserver
@echo "Waiting for Airflow to start..."
@until docker-compose exec airflow-webserver airflow db check; do \
echo "Airflow is not healthy yet. Retrying in 5 seconds..."; \
sleep 5; \
done
@echo "Creating Airflow user..."
@docker-compose exec airflow-webserver airflow users create -u airflow -p airflow -r Admin --verbose -f air -l flow -e airflow@airflow.air
@if docker-compose exec airflow-webserver airflow connections get minio; then \
echo "Connection 'minio' already exists. Skipping creation."; \
else \
echo "Creating connection 'minio'..."; \
docker-compose exec airflow-webserver airflow connections add --conn-login minioadmin --conn-password minioadmin --conn-host minio --conn-port 9000 --conn-schema http --conn-extra '{"endpoint_url": "http://minio:9000"}' --conn-type aws minio; \
fi
@echo "Stopping Airflow..."
@docker-compose stop airflow-webserver
@echo "Setup complete."

.PHONY: start
start:
@echo "Detecting virtual environment..."
@if [ -n "$$VIRTUAL_ENV" ]; then \
echo "Virtual environment detected at $$VIRTUAL_ENV"; \
export VIRTUAL_ENV_PATH=$$VIRTUAL_ENV; \
else \
echo "No virtual environment detected."; \
fi
@echo "Starting up the microservices..."
@docker-compose up -d
@echo "Done."
@echo "\nFrontends are available at the following links:"
@echo "ChromaDB: http://localhost:3000/collections/legal-database"
@echo "Neo4j: http://localhost:7474"
@echo "Minio: http://localhost:9000"
@echo "Airflow: http://localhost:8080"

.PHONY: stop
stop:
@echo "Stopping the microservices..."
@docker-compose down


.PHONY: profile
profile:
@py-spy record -o profile.svg -- python dags/jurisprudencia.py

# Docker build
.PHONY: build
build:
@docker build -t semantic_airflow .

# Run the server
server:
@echo "Running the server on port $(PORT)..."
@semantic server --port $(PORT)
@verdictnet server --port $(PORT)

# Run the ETL pipeline
etl:
@echo "Running the ETL pipeline with path $(ETL_PATH) and force $(FORCE)..."
@semantic etl run --path $(ETL_PATH) --force $(FORCE)
@verdictnet etl run --path $(ETL_PATH) --force $(FORCE)

# Run tests
test:
Expand All @@ -25,4 +122,4 @@ test:
# Clean the vector database
clean:
@echo "Cleaning the vector database..."
@semantic etl clean
@verdictnet etl clean
51 changes: 38 additions & 13 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,24 +1,49 @@
# Semantic Graph Search Project
# VerdictNet: Legal Semantic Search Engine

This project is a semantic graph search system designed to manage and query data.
Currently, the interface is a simple command-line interface (CLI) tool and a web server.
The system supports data ingestion, cleaning, querying, and running a server for frontend interactions.


## Quick Start
You should be able to kickstart the project by launching the docker compose setup and then starting the server:
## Development Quick Start
It is strongly recommended to use a virtual environment to run the project. You should be able to kickstart the project by running the following commands:
```sh
$ docker-compose up
$ make setup
```
This will launch the following services:

This will
- Install the package requirements in the current python environment (python 3.12 recommended)
- Create an `.env` file with the necessary environment variables if it does not exist. Copy this `.env` file to the `config/local` directory if it does not already exist.
- Create the `legal` and `airflow-logs` buckets in the Minio object storage.
- Create the `airflow` user in Airflow.
- Create the `minio` Airflow connection.
- Install development dependencies.

After this, you can start the development environment by running:
```sh
$ make start
```
The first launch will take some time because it will build the docker images.

When done, the following services will be up and running:
- [ChromaDB Browser: `http://localhost:3000/collections/legal-database`](http://localhost:3000/collections/legal-database). This is the vector Database used to run semantic queries.
- [Neo4J Browser: `http://localhost:7474`](http://localhost:7474). This is a GUI to the graph database that will hold the relationships between the different documents indexed in the ChromaDB.
- [Airflow: `http://localhost:8080`](http://localhost:8080). This is the scheduler used to run daily data mining tasks.
- [Minio Console: `http://localhost:9001`](http://localhost:9001). This is the object storage used to store the documents in local development environments.

The Postgres database is used by Airflow and is persisted to the `postgress_storage` volume. This means you can do a clean restart of the services without losing the data in the database.



### Running the ETL pipeline
To run the ETL pipeline, you can run the following command:
```sh
$ make etl
```

Finally, run
```sh
$ semantic server
$ verdictnet server
```
to launch the frontend interface, accessible through
- [Frontend: `http://localhost:8000`](http://localhost:8000)
Expand All @@ -33,7 +58,7 @@ Run data pipelines to ingest and process documents.

#### Usage:
```sh
$ semantic etl [--path PATH] [--force FORCE] {clean,run}
$ verdictnet etl [--path PATH] [--force FORCE] {clean,run}

--path PATH: Path where to look for document specs.
--force FORCE: Force download of documents.
Expand All @@ -46,30 +71,30 @@ run: Ingest data into the vector database.
Query the data stored in the system.
Usage:
```sh
$ semantic query [--query QUERY] [--n_results N_RESULTS] [--interactive]
$ verdictnet query [--query QUERY] [--n_results N_RESULTS] [--interactive]
```
### Server
Run the server to provide a frontend interface.
Usage:
```sh
$ semantic server [-p PORT]
$ verdictnet server [-p PORT]

-p, --port PORT: Port to run the frontend on (default: 8000).
```

## Example usage
```sh
# Clean the vector database
semantic etl clean
verdictnet etl clean

# Run the ETL pipeline
semantic etl run --path /path/to/docspecs --force true
verdictnet etl run --path /path/to/docspecs --force true

# Query the data
semantic query --query "example query" --n_results 5
verdictnet query --query "example query" --n_results 5

# Run the server
semantic server --port 8080
verdictnet server --port 8080
```

## Configuration
Expand Down
4 changes: 4 additions & 0 deletions config.ini
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
# This is the config file meant for running the application in the host machine
# It uses the minio storage

[storage]
type: s3
bucket: legal
Expand All @@ -17,6 +20,7 @@ user: neo4j
password: neo4jtest

[embedding]
# Lightweight, fast model
model_name_or_path: paraphrase-mpnet-base-v2
cache: cache/

Expand Down
12 changes: 12 additions & 0 deletions config/local/airflow.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
[logging]
remote_logging = True
remote_base_log_folder = s3://airflow-logs
remote_log_conn_id = minio
encrypt_s3_logs = False

[webserver]
default_dag_run_display_number = 250
expose_config = True

[celery]
worker_concurrency = 2
30 changes: 30 additions & 0 deletions config/local/config.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
[storage]
type: s3
bucket: legal
collection: legal-database
raw: datalake/raw/
refined: datalake/refined/
html: datalake/html/

[chroma]
type: http
host: chromadb
port: 8000

[neo4j]
url: bolt://neo4j:7687
user: neo4j
password: neo4jtest

[embedding]
model_name_or_path: paraphrase-mpnet-base-v2
# use this because this is the mounting point in the docker compose
cache: /cache

[rag]
n_results: 5

[s3]
key = minioadmin
secret = minioadmin
endpoint_url = http://minio:9000
Loading