From f53a797169f3a2925b08169bd4671be2102548ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o=20LUDWIG?= Date: Thu, 8 Aug 2024 02:21:53 +0100 Subject: [PATCH] feat: wikipedia data dump working --- .gitignore | 1 + TODO.md | 23 ++++++++++++++++------- data/compose.yaml | 1 + data/sql/0000-tables-create.sql | 10 ++++++++-- data/sql/0999-constraints.sql | 11 +++++++++++ 5 files changed, 37 insertions(+), 9 deletions(-) create mode 100644 data/sql/0999-constraints.sql diff --git a/.gitignore b/.gitignore index 658405c..5159e80 100644 --- a/.gitignore +++ b/.gitignore @@ -25,6 +25,7 @@ cache.json data/dump data/sql/* !data/sql/0000-tables-create.sql +!data/sql/0999-constraints.sql # debug npm-debug.log* diff --git a/TODO.md b/TODO.md index af11a92..0a5c6fd 100644 --- a/TODO.md +++ b/TODO.md @@ -6,13 +6,22 @@ - [x] Download SQL files - [x] Extract SQL files - [x] Tables structure `CREATE TABLE` - - [x] `page.sql` (`pages` tables) - - [x] `pagelinks.sql` (`internal_links` tables) + - [x] `page.sql` (`pages` table) + - [x] `pagelinks.sql` (`internal_links` table) - [x] Adapt downloaded SQL files - - [x] `page.sql` (`pages` tables) - - [x] `pagelinks.sql` (`internal_links` tables) - - [ ] Import SQL files => Investigate why there is an error when importing - - [ ] Try `SELECT count(*) FROM internal_links il WHERE il.from_page_id = (SELECT p.id FROM pages p WHERE p.title = 'Linux'); -- Count of internal links for 'Linux' page` + - [x] `page.sql` (`pages` table) + - [x] `pagelinks.sql` (`internal_links` table) + - [x] Import SQL files + - [x] Try `SELECT count(*) FROM internal_links il WHERE il.from_page_id = (SELECT p.id FROM pages p WHERE p.title = 'Linux'); -- Count of internal links for 'Linux' page` + - [x] Try: + ```sql + SELECT il.to_page_id, pl.title + FROM internal_links il + JOIN pages pl ON pl.id = il.to_page_id + WHERE il.from_page_id = ( + SELECT p.id FROM pages p WHERE p.title = 'Node.js' + ); + ``` - [ ] Move from POC (Proof of concept) in `data` folder to `apps/cli` folder - [ ] Documentation how to use + Last execution date - [ ] Rewrite bash script to download and extract SQL files from Wikipedia Database Dump to Node.js for better cross-platform support and easier maintenance + automation, preferably one Node.js script to generate everything to create the database @@ -25,7 +34,7 @@ - [ ] Implement CLI (`cli`) - [ ] Add docs to add locale/edit translations, create component, install a dependency in a package, create a new package, technology used, architecture, links where it's deployed, how to use/install for end users, how to update dependencies with `npx taze -l` etc. - [ ] GitHub Mirror -- [ ] Delete `TODO.md` file and instead use issue for the remaining tasks +- [ ] Delete `TODO.md` file and instead use issues for the remaining tasks ## Links diff --git a/data/compose.yaml b/data/compose.yaml index a1c0b3e..6022fca 100644 --- a/data/compose.yaml +++ b/data/compose.yaml @@ -11,6 +11,7 @@ services: MARIADB_DATABASE: ${DATABASE_NAME} command: --innodb_buffer_pool_size=4G + --key-buffer-size=4G --innodb_log_buffer_size=256M --innodb_log_file_size=1G --innodb_write_io_threads=16 diff --git a/data/sql/0000-tables-create.sql b/data/sql/0000-tables-create.sql index 6454d64..01245fa 100644 --- a/data/sql/0000-tables-create.sql +++ b/data/sql/0000-tables-create.sql @@ -5,7 +5,7 @@ CREATE TABLE `pages` ( PRIMARY KEY (`id`), UNIQUE KEY (`title`) -) ENGINE=InnoDB AUTO_INCREMENT=76684425 DEFAULT CHARSET=binary ROW_FORMAT=COMPRESSED; +) ENGINE=MyISAM AUTO_INCREMENT=76684425 DEFAULT CHARSET=binary ROW_FORMAT=COMPRESSED; -- VARBINARY usage instead of VARCHAR explanation: -- > War on varchar. Changed all occurrences of varchar(N) and varchar(N) binary to varbinary(N). varchars cause problems ("Invalid mix of collations" errors) on MySQL databases with certain configs, most notably the default MySQL config. @@ -19,4 +19,10 @@ CREATE TABLE `internal_links` ( PRIMARY KEY (`from_page_id`, `to_page_id`), FOREIGN KEY (`from_page_id`) REFERENCES `pages` (`id`) ON DELETE CASCADE, FOREIGN KEY (`to_page_id`) REFERENCES `pages` (`id`) ON DELETE CASCADE -) ENGINE=InnoDB DEFAULT CHARSET=binary ROW_FORMAT=COMPRESSED; +) ENGINE=MyISAM DEFAULT CHARSET=binary ROW_FORMAT=COMPRESSED; + +SET @@session.unique_checks = 0; +SET @@session.foreign_key_checks = 0; + +SET FOREIGN_KEY_CHECKS = 0; +SET UNIQUE_CHECKS = 0; diff --git a/data/sql/0999-constraints.sql b/data/sql/0999-constraints.sql new file mode 100644 index 0000000..ce25e80 --- /dev/null +++ b/data/sql/0999-constraints.sql @@ -0,0 +1,11 @@ +-- SET @@session.foreign_key_checks = 0; +-- SET FOREIGN_KEY_CHECKS = 0; + +-- ALTER TABLE `internal_links` ADD CONSTRAINT fk_from_page_id FOREIGN KEY (`from_page_id`) REFERENCES `pages` (`id`); +-- ALTER TABLE `internal_links` ADD CONSTRAINT fk_to_page_id FOREIGN KEY (`to_page_id`) REFERENCES `pages` (`id`); + +SET @@session.unique_checks = 1; +SET @@session.foreign_key_checks = 1; + +SET FOREIGN_KEY_CHECKS = 1; +SET UNIQUE_CHECKS = 1;