From b04f10ce2d4f174f431e8dbbbea7b75b49e04c82 Mon Sep 17 00:00:00 2001 From: Minoru Osuka Date: Mon, 18 Mar 2019 08:51:09 +0900 Subject: [PATCH 1/2] Add Wikipedia example --- README.md | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/README.md b/README.md index 6040c11..6ca8450 100644 --- a/README.md +++ b/README.md @@ -578,3 +578,46 @@ You can execute the command in docker container as follows: ```bash $ docker exec -it blast-index1 blast-index node --grpc-addr=:5050 ``` + + +## Wikipedia example + +This section explains how to index a Wikipedia dump into Blast. + + +### Download Wikipedia dump + +```bash +$ curl -o ~/tmp/enwiki-20190101-pages-articles.xml.bz2 https://dumps.wikimedia.org/enwiki/20190101/enwiki-20190101-pages-articles.xml.bz2 +``` + + +### Install wikiextractor + +```bash +$ cd ${HOME} +$ git clone git@github.com:attardi/wikiextractor.git +``` + + +### Parsing Wikipedia dump + +```bash +$ cd wikiextractor +$ ./WikiExtractor.py -o ~/tmp/enwiki --json ~/tmp/enwiki-20190101-pages-articles.xml.bz2 +``` + + +### Indexing Wikipedia dump + +```bash +$ for FILE in $(find ~/tmp/enwiki -type f -name '*' | sort) + do + cat ${FILE} | while read -r LINE; do + TIMESTAMP=$(date -u "+%Y-%m-%dT%H:%M:%SZ") + ID=$(echo ${LINE} | jq -r .id) + FIELDS=$(echo ${LINE} | jq -c -r '{url: .url, title_en: .title, text_en: .text, timestamp: "'${TIMESTAMP}'"}') + curl -X PUT "http://127.0.0.1:8080/documents/${ID}" -d "${FIELDS}" + done + done +``` From 26223cf1a9c0e4d8e39be6c7883cab06ac7c2199 Mon Sep 17 00:00:00 2001 From: Minoru Osuka Date: Mon, 18 Mar 2019 08:52:55 +0900 Subject: [PATCH 2/2] Update CHANGES.md --- CHANGES.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGES.md b/CHANGES.md index 38500ee..62305b8 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/). 
### Added +- Add Wikipedia example #35 - Support cznicb and leveldb #34 - Add CHANGES.md #29 - Add error handling for server startup #28.