Compare commits

...

2 Commits

SHA1 Message Date
f53a797169 feat: wikipedia data dump working 2024-08-08 02:21:53 +01:00
8dec198afe fix: wikipedia data dump POC improvements 2024-08-07 00:21:08 +01:00
8 changed files with 99 additions and 35 deletions

.gitignore (5 changes)
View File

@@ -23,8 +23,9 @@ build/
 bin/
 cache.json
 data/dump
-data/sql/2-pages-inserts.sql
-data/sql/3-internal-links-inserts.sql
+data/sql/*
+!data/sql/0000-tables-create.sql
+!data/sql/0999-constraints.sql
 
 # debug
 npm-debug.log*

TODO.md (23 changes)
View File

@@ -6,13 +6,22 @@
 - [x] Download SQL files
 - [x] Extract SQL files
 - [x] Tables structure `CREATE TABLE`
-- [x] `page.sql` (`pages` tables)
-- [x] `pagelinks.sql` (`internal_links` tables)
+- [x] `page.sql` (`pages` table)
+- [x] `pagelinks.sql` (`internal_links` table)
 - [x] Adapt downloaded SQL files
-- [x] `page.sql` (`pages` tables)
-- [x] `pagelinks.sql` (`internal_links` tables)
+- [x] `page.sql` (`pages` table)
+- [x] `pagelinks.sql` (`internal_links` table)
-- [ ] Import SQL files => Investigate why there is an error when importing
+- [x] Import SQL files
-- [ ] Try `SELECT count(*) FROM internal_links il WHERE il.source_id = (SELECT p.id FROM pages p WHERE p.title = 'Linux'); -- Count of internal links for 'Linux' page`
+- [x] Try `SELECT count(*) FROM internal_links il WHERE il.from_page_id = (SELECT p.id FROM pages p WHERE p.title = 'Linux'); -- Count of internal links for 'Linux' page`
+- [x] Try:
+  ```sql
+  SELECT il.to_page_id, pl.title
+  FROM internal_links il
+  JOIN pages pl ON pl.id = il.to_page_id
+  WHERE il.from_page_id = (
+    SELECT p.id FROM pages p WHERE p.title = 'Node.js'
+  );
+  ```
 - [ ] Move from POC (Proof of concept) in `data` folder to `apps/cli` folder
 - [ ] Documentation how to use + Last execution date
 - [ ] Rewrite bash script to download and extract SQL files from Wikipedia Database Dump to Node.js for better cross-platform support and easier maintenance + automation, preferably one Node.js script to generate everything to create the database

@@ -25,7 +34,7 @@
 - [ ] Implement CLI (`cli`)
 - [ ] Add docs to add locale/edit translations, create component, install a dependency in a package, create a new package, technology used, architecture, links where it's deployed, how to use/install for end users, how to update dependencies with `npx taze -l` etc.
 - [ ] GitHub Mirror
-- [ ] Delete `TODO.md` file and instead use issue for the remaining tasks
+- [ ] Delete `TODO.md` file and instead use issues for the remaining tasks
 
 ## Links

View File

@@ -2,7 +2,7 @@
 ```sh
 ./download-wikipedia-dump.sh
-node --max-old-space-size=10096 database-wikipedia.js
+node --max-old-space-size=8096 database-wikipedia.js
 ```
 
 ## Utils
@@ -13,6 +13,12 @@ Show the first 10 characters of sql file: `head -c 10 ./dump/page.sql`
 To inspect volume size used by database: `docker system df -v | grep 'wikipedia-solver-mariadb-data'`
+
+To enter the database container: `docker exec -it wikipedia-solver-database sh`
+
+Then: `mariadb --password="${DATABASE_PASSWORD}" --user="${DATABASE_USER}"`
+
+And `use wikipedia;`, then for example: `SELECT * FROM pages LIMIT 10;`, or to execute a SQL script: `source /docker-entrypoint-initdb.d/3-internal-links-inserts.sql;`.
 
 ## Remove a volume
 
 ```sh

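The README additions above cover manual checks through the `mariadb` shell. The same checks can be scripted; a minimal Node.js sketch, assuming the `mysql2` package (not part of this diff) and that the container's MariaDB port is published to the host:

```js
// query-pages.mjs - hypothetical helper script, not part of this diff.
import mysql from "mysql2/promise"

const connection = await mysql.createConnection({
  host: "127.0.0.1", // assumes the container's port 3306 is published locally
  user: process.env.DATABASE_USER,
  password: process.env.DATABASE_PASSWORD,
  database: "wikipedia",
})

// Same check as the manual `SELECT * FROM pages LIMIT 10;` above.
const [rows] = await connection.query("SELECT * FROM pages LIMIT 10;")
console.log(rows)

await connection.end()
```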
View File

@@ -11,6 +11,7 @@ services:
       MARIADB_DATABASE: ${DATABASE_NAME}
     command:
       --innodb_buffer_pool_size=4G
+      --key-buffer-size=4G
       --innodb_log_buffer_size=256M
       --innodb_log_file_size=1G
       --innodb_write_io_threads=16

View File

@@ -1,9 +1,14 @@
 import fs from "node:fs"
 import path from "node:path"
-import { extractRowsFromSQLValues, swapKeysAndValues } from "./utils.js"
+import {
+  extractRowsFromSQLValues,
+  swapKeysAndValues,
+  zeroPad,
+} from "./utils.js"
 
 const SQL_DUMP_PATH = path.join(process.cwd(), "dump")
 const SQL_OUTPUT_PATH = path.join(process.cwd(), "sql")
+const SQL_FILENAME_NUMBER_PAD = 4
 
 /**
  * @typedef {Record<string, number>} WikipediaPagesKeyTitle
@@ -110,7 +115,10 @@ const wikipediaPagesKeyId = await cleanPagesSQL()
 const cleanPagesSQLWriteToFile = async () => {
   console.log("cleanPagesSQLWriteToFile - Writing to file...")
-  const sqlOutputPath = path.join(SQL_OUTPUT_PATH, "2-pages-inserts.sql")
+  const sqlOutputPath = path.join(
+    SQL_OUTPUT_PATH,
+    `${zeroPad(1, SQL_FILENAME_NUMBER_PAD)}-pages-inserts.sql`,
+  )
 
   const INSERT_INTO_START_OUTPUT = "INSERT INTO pages (id, title) VALUES "
   const wikipediaPagesString = Object.entries(wikipediaPagesKeyId)
@@ -129,12 +137,6 @@ const cleanPagesSQLWriteToFile = async () => {
 await cleanPagesSQLWriteToFile()
 
-const sqlOutputPath = path.join(SQL_OUTPUT_PATH, "3-internal-links-inserts.sql")
-const INSERT_INTO_START_OUTPUT =
-  "INSERT INTO internal_links (from_page_id, to_page_id) VALUES "
-const sqlOutputFile = await fs.promises.open(sqlOutputPath, "w")
-await sqlOutputFile.appendFile(INSERT_INTO_START_OUTPUT, "utf-8")
-
 /**
  * Function to clean the `pagelinks.sql` file by:
  * - Removing all lines that don't start with `INSERT INTO...`.
@@ -143,6 +145,10 @@ await sqlOutputFile.appendFile(INSERT_INTO_START_OUTPUT, "utf-8")
  * @returns {Promise<void>}
  */
 const cleanInternalLinksSQL = async () => {
+  let internalLinksFileCount = 2
+  const INSERT_INTO_START_OUTPUT =
+    "INSERT INTO internal_links (from_page_id, to_page_id) VALUES "
+
   /**
    * @type {WikipediaPagesKeyTitle}
    */
@@ -157,17 +163,27 @@ const cleanInternalLinksSQL = async () => {
   let current = ""
   let lastPercent = 0
-  const BATCH_SIZE = 10_000
+  const BATCH_SIZE = 4_000_000
 
   /**
    * @type {string[]}
    */
   let batch = []
 
-  const flushBatch = async (isLast = false) => {
+  const flushBatch = async () => {
     if (batch.length > 0) {
-      const batchString = batch.join(",") + (isLast ? ";" : ",")
-      await sqlOutputFile.appendFile(batchString, "utf-8")
+      const batchString = batch.join(",")
+      const fileName = `${zeroPad(internalLinksFileCount, SQL_FILENAME_NUMBER_PAD)}-internal-links-inserts.sql`
+      const sqlOutputPath = path.join(SQL_OUTPUT_PATH, fileName)
+      await fs.promises.writeFile(
+        sqlOutputPath,
+        `${INSERT_INTO_START_OUTPUT}${batchString};`,
+        {
+          encoding: "utf-8",
+        },
+      )
+      console.log(`flushBatch - ${fileName}, batch.length: ${batch.length}`)
+      internalLinksFileCount += 1
       batch = []
     }
   }
@@ -218,7 +234,7 @@ const cleanInternalLinksSQL = async () => {
         if (plFromNamespace === "0" && plTargetNamespace === "0") {
           const toPageId = wikipediaPagesKeyTitle[plTargetTitle]
 
-          if (toPageId != null) {
+          if (toPageId != null && wikipediaPagesKeyId[plFromPageId] != null) {
             /**
              * @type {WikipediaInternalLink}
              */
@@ -229,19 +245,21 @@ const cleanInternalLinksSQL = async () => {
             batch.push(
               `(${wikipediaInternalLink.fromPageId},${wikipediaInternalLink.toPageId})`,
             )
+          }
+        }
+      }
 
-          if (batch.length >= BATCH_SIZE) {
-            await flushBatch()
-          }
-        }
-      }
-    }
+      if (batch.length >= BATCH_SIZE) {
+        sqlInputFileStream.pause()
+        await flushBatch()
+        sqlInputFileStream.resume()
+      }
     })
     .on("error", (error) => {
       return reject(error)
     })
     .on("close", async () => {
-      await flushBatch(true)
+      await flushBatch()
       console.log("cleanInternalLinksSQL - Bytes read (100%).")
       return resolve()
     })

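The `database-wikipedia.js` changes above swap one ever-growing output file for self-contained numbered batch files, and wrap each `flushBatch` call in `pause()`/`resume()` so no further `data` events fire while a batch is being written. That pause/resume pairing is the standard backpressure pattern for async work inside a stream's `data` handler; a minimal standalone sketch, with hypothetical file names:

```js
// backpressure-sketch.js - minimal illustration, not part of this diff.
import fs from "node:fs"

const stream = fs.createReadStream("input.txt", { encoding: "utf-8" })

stream.on("data", async (chunk) => {
  // Stop new "data" events while the asynchronous write is in flight.
  stream.pause()
  await fs.promises.appendFile("output.txt", chunk, "utf-8")
  stream.resume()
})

stream.on("close", () => {
  console.log("done")
})
```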
View File

@@ -5,18 +5,24 @@ CREATE TABLE `pages` (
   PRIMARY KEY (`id`),
   UNIQUE KEY (`title`)
-) ENGINE=InnoDB AUTO_INCREMENT=76684425 DEFAULT CHARSET=binary ROW_FORMAT=COMPRESSED;
+) ENGINE=MyISAM AUTO_INCREMENT=76684425 DEFAULT CHARSET=binary ROW_FORMAT=COMPRESSED;
 
 -- VARBINARY usage instead of VARCHAR explanation: <https://stackoverflow.com/a/13397437>
 -- > War on varchar. Changed all occurrences of varchar(N) and varchar(N) binary to varbinary(N). varchars cause problems ("Invalid mix of collations" errors) on MySQL databases with certain configs, most notably the default MySQL config.
 
 CREATE TABLE `internal_links` (
-  `id` INT(8) UNSIGNED NOT NULL AUTO_INCREMENT,
+  -- `id` INT(8) UNSIGNED NOT NULL AUTO_INCREMENT,
   `from_page_id` INT(8) UNSIGNED NOT NULL,
   `to_page_id` INT(8) UNSIGNED NOT NULL,
-  PRIMARY KEY (`id`)
-  -- PRIMARY KEY (`from_page_id`, `to_page_id`),
-  -- FOREIGN KEY (`from_page_id`) REFERENCES `pages` (`id`) ON DELETE CASCADE,
-  -- FOREIGN KEY (`to_page_id`) REFERENCES `pages` (`id`) ON DELETE CASCADE
-) ENGINE=InnoDB DEFAULT CHARSET=binary ROW_FORMAT=COMPRESSED;
+  -- PRIMARY KEY (`id`)
+  PRIMARY KEY (`from_page_id`, `to_page_id`),
+  FOREIGN KEY (`from_page_id`) REFERENCES `pages` (`id`) ON DELETE CASCADE,
+  FOREIGN KEY (`to_page_id`) REFERENCES `pages` (`id`) ON DELETE CASCADE
+) ENGINE=MyISAM DEFAULT CHARSET=binary ROW_FORMAT=COMPRESSED;
+
+SET @@session.unique_checks = 0;
+SET @@session.foreign_key_checks = 0;
+SET FOREIGN_KEY_CHECKS = 0;
+SET UNIQUE_CHECKS = 0;

View File

@@ -0,0 +1,11 @@
+-- SET @@session.foreign_key_checks = 0;
+-- SET FOREIGN_KEY_CHECKS = 0;
+
+-- ALTER TABLE `internal_links` ADD CONSTRAINT fk_from_page_id FOREIGN KEY (`from_page_id`) REFERENCES `pages` (`id`);
+-- ALTER TABLE `internal_links` ADD CONSTRAINT fk_to_page_id FOREIGN KEY (`to_page_id`) REFERENCES `pages` (`id`);
+
+SET @@session.unique_checks = 1;
+SET @@session.foreign_key_checks = 1;
+SET FOREIGN_KEY_CHECKS = 1;
+SET UNIQUE_CHECKS = 1;

View File

@@ -74,3 +74,15 @@ export const swapKeysAndValues = (object) => {
     }),
   )
 }
+
+/**
+ * @param {number} number
+ * @param {number} places
+ * @returns {string}
+ * @example zeroPad(1, 2) // '01'
+ * @example zeroPad(10, 2) // '10'
+ */
+export const zeroPad = (number, places = 2) => {
+  return number.toString().padStart(places, "0")
+}
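With `SQL_FILENAME_NUMBER_PAD = 4`, `zeroPad` produces the four-digit prefixes seen in the new `.gitignore` entries, so the generated files sort lexicographically between the hand-written `0000-tables-create.sql` and `0999-constraints.sql`, which is the order in which the MariaDB image executes `/docker-entrypoint-initdb.d` scripts. For example:

```js
import { zeroPad } from "./utils.js"

const SQL_FILENAME_NUMBER_PAD = 4

// Name of the first internal-links batch file (internalLinksFileCount starts at 2).
console.log(`${zeroPad(2, SQL_FILENAME_NUMBER_PAD)}-internal-links-inserts.sql`)
// => "0002-internal-links-inserts.sql"
```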