diff --git a/data-loading/Dockerfile b/data-loading/Dockerfile index 331b9fcc..a74ad5bf 100644 --- a/data-loading/Dockerfile +++ b/data-loading/Dockerfile @@ -71,8 +71,9 @@ RUN pip3 install -r requirements.txt COPY --chown=nru setup-and-load-solr.sh ${ROOT} COPY --chown=nru README.md ${ROOT} COPY --chown=nru Makefile ${ROOT} +COPY --chown=nru conf/ ${ROOT}/conf/ # On entry, start the Solr instance. ENV SOLR_EXEC="${ROOT}/solr/solr-${SOLR_VERSION}/bin/solr" ENV SOLR_DIR="$SOLR_DIR" -ENTRYPOINT ${SOLR_EXEC} -cloud -f -p 8983 -m 64G -s ${SOLR_DIR} +ENTRYPOINT ${SOLR_EXEC} -f -p 8983 -m 64G -s ${SOLR_DIR} diff --git a/data-loading/Makefile b/data-loading/Makefile index e0028bfd..92f61cee 100644 --- a/data-loading/Makefile +++ b/data-loading/Makefile @@ -19,9 +19,8 @@ SOLR_MEM=220G .PHONY: all clean all: data/setup.done echo Solr has now been set up and loaded with the synonym data. - echo Run 'make start-solr-backup' to start a backup. Run 'make check-solr-backup' to check - echo if the backup has completed. Once that has completed, run 'make data/backup.done' to - echo generate a snapshot.backup.tar.gz file that can be used in NameRes. + echo Run 'make data/backup.done' to stop Solr and create the solr-data.tar.gz backup. + echo Copy data/solr-data.tar.gz to a web server for NameRes deployment. clean: rm -rf data/* @@ -41,10 +40,10 @@ data/synonyms/done: echo Split DrugChemicalConflated.txt and GeneProteinConflated.txt, and deleted the original files. touch $@ -# Step 3. Start Solr server. +# Step 3. Start Solr server (standalone mode, no ZooKeeper). data/solr.pid: mkdir -p ${SOLR_DIR}/logs - ${SOLR_EXEC} -cloud -p 8983 -v -m ${SOLR_MEM} -s ${SOLR_DIR} >> ${SOLR_DIR}/logs/solr.txt 2>> ${SOLR_DIR}/logs/solr.err.txt + ${SOLR_EXEC} -p 8983 -v -m ${SOLR_MEM} -s ${SOLR_DIR} >> ${SOLR_DIR}/logs/solr.txt 2>> ${SOLR_DIR}/logs/solr.err.txt while [ ! 
-s $@ ]; do \ ${SOLR_EXEC} status | grep -Po 'Solr process \K([0-9]+)' > $@; \ done @@ -56,28 +55,22 @@ data/setup.done: data/synonyms/done data/solr.pid mkdir -p data/logs bash setup-and-load-solr.sh "data/synonyms/*.txt*" >> data/logs/setup-and-load-solr.sh.log 2>> data/logs/setup-and-load-solr.sh.err.log && touch $@ -# Step 5. Start a Solr backup. -.PHONY: start-solr-backup -start-solr-backup: data/setup.done - curl 'http://localhost:8983/solr/name_lookup/replication?command=backup&name=backup' - -# Step 6. Wait for the backup to complete. -.PHONY: check-solr-backup -check-solr-backup: - curl 'http://localhost:8983/solr/name_lookup/replication?command=details' - -# Step 6. Shutdown the Solr instance. -### data/stop-solr: -### docker exec name_lookup solr stop -p 8983 -verbose - -# Step 7. Generate the backup tarball. -data/backup.done: - mkdir -p data/var/solr/data - mv /var/solr/name_lookup_shard1_replica_n1/data/snapshot.backup data/var/solr/data - cd data && tar zcvf snapshot.backup.tar.gz var && touch backup.done +# Step 5. Create backup: stop Solr cleanly, install read-only config, tar core directory. +# The tarball contains name_lookup/ with conf/ and data/index/ (no tlog). +# To restore: extract into ./data/solr/ and run docker compose up. +data/backup.done: data/setup.done + @echo "Stopping Solr before backup..." + ${SOLR_EXEC} stop + rm -f data/solr.pid + @echo "Installing read-only Solr config for deployment..." + cp conf/solrconfig.xml ${SOLR_DIR}/name_lookup/conf/solrconfig.xml + @echo "Creating backup tarball at data/solr-data.tar.gz ..." + cd ${SOLR_DIR} && tar zcvf $(CURDIR)/data/solr-data.tar.gz --exclude='name_lookup/data/tlog' name_lookup + touch $@ + @echo "Backup complete. Copy data/solr-data.tar.gz to a web server for deployment." .PHONY: stop-solr stop-solr: - rm data/solr.pid + rm -f data/solr.pid ${SOLR_EXEC} stop $(info Solr stopped.)
diff --git a/data-loading/README.md b/data-loading/README.md index 376c9e9f..58d4a8d7 100644 --- a/data-loading/README.md +++ b/data-loading/README.md @@ -7,31 +7,29 @@ Solr, a Solr backup created, and then compressed with a particular directory str ## Using the Makefile This directory includes a Makefile that can be used to run most of these steps -automatically. This is a seven-step process: +automatically. This is a five-step process: 1. Edit the Makefile to choose the directory containing Babel synonym files. Note that - all files in that directory will be used, and any files named `.txt.gz` will uncompressed. + all files in that directory will be used, and any files named `.txt.gz` will be uncompressed. 2. Run `make all` to download the synonym data, uncompress Gzipped files, split the larger files and delete the split files to avoid duplicate loading. `make all` will also start the Solr server -- you can check this by looking for a PID file Solr in `data/solr.pid`. 3. (Optional) Access the Solr server and confirm that all the data has been loaded. -4. Run `make start-solr-backup` to start the Solr backup. -5. Run `make check-solr-backup` to check on the Solr backup. Look for `"status":"success"` to confirm that the - backup has completed. -6. Run `make data/backup.done` to move the backup into the `data/` directory, place it in the correct directory - structure for NameRes, and create a `snapshot.backup.tar.gz` file. -7. Copy the `snapshot.backup.tar.gz` file to a web server so that it can be loaded from NameRes. +4. Run `make data/backup.done` to stop Solr, install the read-only Solr config, and create + `data/solr-data.tar.gz` containing the complete `name_lookup/` core directory. +5. Copy `data/solr-data.tar.gz` to a web server so that it can be loaded from NameRes. ## Step-by-step instructions 1. Set up a Solr server locally. 
The easiest way to do this is via Docker: ```shell - $ docker run -v "$PWD/data/solrdata:/var/solr" --name name_lookup -p 8983:8983 -t solr -cloud -p 8983 -m 12G + $ docker run -v "$PWD/data/solrdata:/var/solr/data" --name name_lookup -p 8983:8983 -t solr -p 8983 -m 12G ``` - + You can adjust the `12G` to increase the amount of memory available to Solr. You can also add `-d` to the - Docker arguments if you would like to run this node in the background. + Docker arguments if you would like to run this node in the background. Note: Solr runs in standalone mode + (no `-cloud` flag), so the data directory is `/var/solr/data/` and cores are stored directly under it. 2. Copy the synonym files into the `data/synonyms` directory. Synonym files that are too large will need to split it into smaller files. (`gsplit` is the GNU version of `split`, which includes support @@ -42,7 +40,7 @@ automatically. This is a seven-step process: $ gsplit -l 5000000 -d --additional-suffix .txt MolecularMixture.txt MolecularMixture ``` -3. Download all the synonym text files into the `data/json` folder. You can download this by running `make`. +3. Download all the synonym text files into the `data/synonyms` folder. You can download this by running `make`. ```shell $ pip install -r requirements.txt @@ -52,54 +50,30 @@ automatically. This is a seven-step process: 4. Load the JSON files into the Solr database by running: ```shell - $ ./setup-and-load-solr.sh "data/json/*.json" + $ ./setup-and-load-solr.sh "data/synonyms/*.txt*" ``` - - Note the double-quotes: setup-and-load-solr.sh requires a glob pattern as its first argument, not a list of files to process! - -5. Generate a backup of the Solr instance. The first command will create a directory at - `solrdata/data/name_lookup_shard1_repical_n1/data/snapshot.backup` -- you can track its progress by comparing the - number of files in that directory to the number of files in `../data/index` (as I write this, it has 513 files). 
- ```shell - $ curl 'http://localhost:8983/solr/name_lookup/replication?command=backup&name=backup' - $ curl 'http://localhost:8983/solr/name_lookup/replication?command=details' - ``` - - Once the backup is complete, you'll see a part of the `details` response that looks like this: - - ```json - "backup":{ - "startTime":"2022-09-13T18:42:43.678219123Z", - "fileCount":512, - "indexFileCount":512, - "status":"success", - "snapshotCompletedAt":"2022-09-13T19:36:00.599797304Z", - "endTime":"2022-09-13T19:36:00.599797304Z", - "snapshotName":"backup", - "directoryName":"snapshot.backup" - } - ``` + Note the double-quotes: setup-and-load-solr.sh requires a glob pattern as its first argument, not a list of files to process! -6. Shutdown the Solr instance. +5. Stop Solr and generate the backup tarball. This stops Solr cleanly, installs the + read-only Solr config, and tars the complete `name_lookup/` core directory (including schema + and index data, excluding the write-ahead log): ```shell $ docker exec name_lookup solr stop -p 8983 -verbose + $ cd data/solrdata + $ tar zcvf ../solr-data.tar.gz --exclude='name_lookup/data/tlog' name_lookup ``` - -7. Generate the backup tarball. At the moment, this is expected to be in the format - `var/solr/data/snapshot.backup/[index files]`. The easiest way to generate this tarball correctly is to run: - ```shell - $ mkdir -p data/var/solr/data - $ mv /var/solr/name_lookup_shard1_replica_n1/data/snapshot.backup data/var/solr/data - $ cd data - $ tar zcvf snapshot.backup.tar.gz var - ``` + The tarball contains `name_lookup/` with `conf/` (schema + config) and `data/index/` + (Lucene index). It is fully self-contained: extract it and Solr is ready to serve queries + with no restore step. -8. Publish `snapshot.backup.tar.gz` to a publicly-accessible URL. +6. Publish `solr-data.tar.gz` to a publicly-accessible URL. -9. 
Use the instructions at https://github.com/helxplatform/translator-devops/tree/develop/helm/name-lookup to set up an - instance of NameRes that downloads snapshot.backup.tar.gz from this publicly-accessible URL. +7. Use the instructions at https://github.com/helxplatform/translator-devops/tree/develop/helm/name-lookup to set up an + instance of NameRes that downloads `solr-data.tar.gz` from this publicly-accessible URL. + Note that the Helm chart restore step is no longer required — the tarball is extracted + directly to the Solr data volume and Solr starts ready to serve queries. -The Makefile included in this directory contains targets for more of these steps. +The Makefile included in this directory contains targets for most of these steps. diff --git a/data-loading/conf/managed-schema.xml b/data-loading/conf/managed-schema.xml new file mode 100644 index 00000000..6f831366 --- /dev/null +++ b/data-loading/conf/managed-schema.xml @@ -0,0 +1,34 @@ + + + + id + diff --git a/data-loading/conf/solrconfig.xml b/data-loading/conf/solrconfig.xml new file mode 100644 index 00000000..a298f212 --- /dev/null +++ b/data-loading/conf/solrconfig.xml @@ -0,0 +1,51 @@ + + + + 9.0 + + + + false + managed-schema.xml + + + + + + + + explicit + 10 + + + + + + explicit + 10 + + + + + + solrpingquery + + + all + + server-enabled.txt + + + diff --git a/data-loading/setup_solr.sh b/data-loading/setup_solr.sh index 0ea2842f..f7868bc1 100644 --- a/data-loading/setup_solr.sh +++ b/data-loading/setup_solr.sh @@ -13,10 +13,10 @@ # require SOLR_SERVER : "${SOLR_SERVER:?SOLR_SERVER must be set}" -echo "Setting up Solr database with SOLR_SERVER='$SOLR_SERVER'" +echo "Setting up Solr database (standalone mode) with SOLR_SERVER='$SOLR_SERVER'" -# add collection -curl -X POST "$SOLR_SERVER/solr/admin/collections?action=CREATE&name=name_lookup&numShards=1&replicationFactor=1" +# add core (standalone mode: uses _default configset) +curl 
"$SOLR_SERVER/solr/admin/cores?action=CREATE&name=name_lookup&configSet=_default" # do not autocreate fields curl "$SOLR_SERVER/solr/name_lookup/config" -d '{"set-user-property": {"update.autoCreateFields": "false"}}' @@ -94,7 +94,7 @@ curl -X POST -H 'Content-type:application/json' --data-binary '{ { "name":"types", "type":"string", - "stored":true + "stored":true, "multiValued":true }, { diff --git a/docker-compose.yml b/docker-compose.yml index ba710749..ecfa471e 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -9,8 +9,6 @@ services: SOLR_JAVA_MEM: '-Xmx16G' ports: - '8983:8983' - command: ['-DzkRun'] - # Solr needs to store its data somewhere. It defaults to `./data`, but you can reconfigure this to any # directory you want. volumes: diff --git a/documentation/Deployment.md b/documentation/Deployment.md index d4ba344e..68478b45 100644 --- a/documentation/Deployment.md +++ b/documentation/Deployment.md @@ -17,15 +17,18 @@ instance or from Translator. with [Docker Compose](https://docs.docker.com/compose/install/). 2. Create the local directory where your Solr data will be stored -- by default, this is `./data/solr` in this directory, but you can change this in - [docker-compose.yml](./docker-compose.yml). This directory will need to have a maximum - storage of approx 400G: 104G of the downloaded file (which can be deleted once decompressed), - 147G of uncompressed backup (both of which can be deleted once restored) and 147G of - Apache Solr databases. + [docker-compose.yml](./docker-compose.yml). This directory will need approximately 250G + of storage: ~100G for the downloaded backup file (which can be deleted once extracted) + and ~150G for the extracted Solr core directory. 3. Download the Solr backup URL you want to use and save it in `./data/solr`. It should be - approximately 104G in size. -4. Uncompress the Solr backup file. 
It should produce a `var/solr/data/snapshot.backup` directory - in the Solr data (by default, `./data/solr/var/solr/data/snapshot.backup`). You can delete - the downloaded file (`snapshot.backup.tar.gz`) once it has been decompressed. + approximately 100G in size. +4. Extract the Solr backup into `./data/solr`: + ``` + cd ./data/solr && tar zxvf solr-data.tar.gz + ``` + This produces a `name_lookup/` directory (the complete Solr core, including schema and + index data). You can delete the downloaded file (`solr-data.tar.gz`) once it has been + extracted. 5. Check the [docker-compose.yml](./docker-compose.yml) file to ensure that it is as you expect. * The Docker Compose file will use the latest released version of NameRes @@ -33,34 +36,25 @@ instance or from Translator. the build instructions for the `nameres` service in the Docker Compose file. * Solr will be given 16G of memory, which seems sufficient for testing. If you want to run many Solr queries, you might want to increase this. To do this, - you will need to change BOTH the `mem_limit` setting in the `nameres_solr` service in + you will need to change BOTH the `mem_limit` setting in the `nameres_solr` service in `docker-compose.yml` and the `SOLR_JAVA_MEM` setting. - * The `docker-compose.yml` file also mounts the local `data/` directory into the Solr - container as `/var/solr`. This will allow you to start a new NameRes from the same + * The `docker-compose.yml` file also mounts the local `data/solr` directory into the Solr + container as `/var/solr/data`. This will allow you to start a new NameRes from the same directory in the future. If you want to use a different directory, please change the `volumes` setting in the `nameres_solr` service in `docker-compose.yml`. Removing the binding will cause the Solr data to be stored in the Docker instance, and the data will be lost when the container is stopped. 6. Start the Solr and NameRes pods by running `docker compose up`. 
By default, Docker Compose - will download and start the relevant pods and show you logs from both sources. You may - press `Ctrl+C` to stop the pods. -7. Trigger the Solr restore by running the restore script using `bash`, i.e. - `bash solr-restore/restore.sh`. This script assumes that the Solr pod is available on `localhost:8983` - and contains a `var/solr/data/snapshot.backup` directory with the data to restore. It will set up - some data types needed by NameRes and then triggering a restore of a backup. It will then go into a - sleep loop until the restore is complete, which should take 15-20 minutes. -8. Check that the script ended properly (`Solr restore complete!`). Look up http://localhost:2433/status - to ensure that the database has been loaded as expected. You can now delete the uncompressed database - backup in `$SOLR_DATA/var` to save disk space. -9. With the default settings, NameRes should be running on localhost on port 2433 (i.e. http://localhost:2433/). + will download and start the relevant pods and show you logs from both sources. Solr will + find the extracted `name_lookup/` core and be ready immediately — no separate restore step + is required. You may press `Ctrl+C` to stop the pods. +7. With the default settings, NameRes should be running on localhost on port 2433 (i.e. http://localhost:2433/). + Look up http://localhost:2433/status to confirm that the database has been loaded as expected. You should see a message in the NameRes pod log saying something like `Uvicorn running on http://0.0.0.0:2433 (Press CTRL+C to quit)` to confirm this. * By default, the web frontend (http://0.0.0.0:2433/docs) defaults to using the [NameRes RENCI Dev](https://name-resolution-sri.renci.org/docs) — you will need to change the "Servers" setting to use your local NameRes instance. - * If you try this before the restore has finished, looking up http://0.0.0.0:2433/status will give you an error - (`Expected core not found.`). 
This is because the Solr database and indexes have not yet been loaded. - Once this is finished, the NameRes instance should be ready to use. #### Loading from synonyms files diff --git a/solr-restore/README.md b/solr-restore/README.md deleted file mode 100644 index 9da72dfd..00000000 --- a/solr-restore/README.md +++ /dev/null @@ -1,12 +0,0 @@ -# solr-restore - -This directory contains a script that can be used to restore a local Apache Solr backup to a Solr database -in Docker along with the indexes needed to query them from NameRes. It assumes that the backup is present -on the Solr server in the Solr data directory (by default `./data/solr`) and is -named `snapshot.backup.tar.gz`. If you follow the instructions in [the main README file](../README.md), -this script will be used automatically. - -It is essentially the same script as is included in -[the name-lookup Helm chart](https://github.com/helxplatform/translator-devops/tree/develop/helm/name-lookup) -of the `translator-devops` repository, but with some modifications allowing the script to be used -locally. diff --git a/solr-restore/restore.sh b/solr-restore/restore.sh deleted file mode 100644 index 4bc6133c..00000000 --- a/solr-restore/restore.sh +++ /dev/null @@ -1,64 +0,0 @@ -#!/usr/bin/env bash -# -# restore.sh -# -# Restores a Solr backup located in the Solr data directory (`$SOLR_DATA/var/solr/data/snapshot.backup`). -# -# To do this, it must: -# - Initiate the restore. -# - Wait until the restore has completed. -# - Create the necessary fields (hopefully we can make this unnecessary, see https://github.com/TranslatorSRI/NameResolution/issues/185) -# -# This script should only require the `wget` program. -# -# TODO: This script does not currently implement any Blocklists. - -# We don't use set -e because the loop test relies on failures being ignored. 
-set -uo pipefail - -# Configuration options -SOLR_SERVER="http://localhost:8983" -SLEEP_INTERVAL=60 - -# Please don't change these values unless you change NameRes appropriately! -COLLECTION_NAME="name_lookup" -BACKUP_NAME="backup" - -# Step 0. Make sure the Solr data directory looks like it contains the uncompressed backup. -if [ ! -d "./data/solr/var" ]; then - echo 'WARNING: No ./data/solr/var directory found; are you sure you uncompressed the NameRes backup into the Solr data directory?' >&2 -fi - -# Step 1. Make sure the Solr service is up and running. -HEALTH_ENDPOINT="${SOLR_SERVER}/solr/admin/cores?action=STATUS" -response=$(wget --spider --server-response ${HEALTH_ENDPOINT} 2>&1 | grep "HTTP/" | awk '{ print $2 }') >&2 -until [ "$response" = "200" ]; do - response=$(wget --spider --server-response ${HEALTH_ENDPOINT} 2>&1 | grep "HTTP/" | awk '{ print $2 }') >&2 - echo " -- SOLR is unavailable - sleeping" - sleep 3 -done -echo "SOLR is up and running at ${SOLR_SERVER}." - -# Step 2. Create fields for search. -SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)" -source "$SCRIPT_DIR/../data-loading/setup_solr.sh" -echo Solr database has been set up. - -# Step 3. Restore the data -CORE_NAME="${COLLECTION_NAME}_shard1_replica_n1" -echo "Starting Solr restore on core ${CORE_NAME}, with status at ${SOLR_SERVER}/solr/${CORE_NAME}/replication?command=restorestatus" -RESTORE_URL="${SOLR_SERVER}/solr/${CORE_NAME}/replication?command=restore&location=/var/solr/data/var/solr/data/&name=${BACKUP_NAME}" -wget -O - "$RESTORE_URL" -sleep "$SLEEP_INTERVAL" -RESTORE_STATUS_URL="${SOLR_SERVER}/solr/${CORE_NAME}/replication?command=restorestatus" -RESTORE_STATUS=$(wget -q -O - "$RESTORE_STATUS_URL" 2>&1 | grep "success") -RESTORE_STATUS="" -until [ -n "$RESTORE_STATUS" ] ; do - echo "Solr restore in progress. If this takes longer than 30 minutes, please visit ${SOLR_SERVER} with your browser to check Solr." 
- RESTORE_STATUS=$(wget -q -O - "$RESTORE_STATUS_URL" 2>&1 | grep "success") - sleep "$SLEEP_INTERVAL" -done -echo "Solr restore complete!" - -echo "Solr contents:" -curl -s --negotiate -u: "$SOLR_SERVER/solr/name_lookup/query?q=*:*&rows=0"