diff --git a/.gitignore b/.gitignore index b5875ab..62337a4 100644 --- a/.gitignore +++ b/.gitignore @@ -22,6 +22,8 @@ build/ .turbo bin/ cache.json +data/dump +data/sql/2-pages-inserts.sql # debug npm-debug.log* diff --git a/TODO.md b/TODO.md index 105dccd..c7974ed 100644 --- a/TODO.md +++ b/TODO.md @@ -2,11 +2,25 @@ - [x] chore: initial commit (+ mirror on GitHub) - [x] Deploy first staging version (v1.0.0-staging.1) +- [x] Wikipedia Database Dump + - [x] Download SQL files + - [x] Extract SQL files + - [x] Tables structure `CREATE TABLE` + - [x] `page.sql` (`pages` tables) + - [ ] `pagelinks.sql` (`internal_links` tables) + - [x] Adapt downloaded SQL files + - [x] `page.sql` (`pages` tables) + - [ ] `pagelinks.sql` (`internal_links` tables) + - [x] Import SQL files + - [ ] Verify file content up to before inserts, to check if it matches last version, and diff with last version + - [ ] Try `SELECT count(*) FROM internal_links il WHERE il.source_id = (SELECT p.id FROM pages p WHERE p.title = 'Linux'); -- Count of internal links for 'Linux' page` + - [ ] Move from POC (Proof of concept) in `data` folder to `apps/cli` folder + - [ ] `.gitignore` correctly + Documentation how to use + Last execution date +- [ ] Implement REST API (`api`) with JSON responses ([AdonisJS](https://adonisjs.com/)) - [ ] Implement Wikipedia Game Solver (`website`) with inputs, button to submit, and list all pages to go from one to another, or none if it is not possible -- [ ] Check, cache and store (in `.json` file) all Wikipedia Pages and its internal links, maybe use Wikipedia Dump ()? 
+- [ ] Check how to deal with redirects (+ Wikipedia Database Dump related) - [ ] Implement toast notifications for errors, warnings, and success messages - [ ] Implement CLI (`cli`) -- [ ] Implement REST API (`api`) with JSON responses ([AdonisJS](https://adonisjs.com/)) - [ ] Add docs to add locale/edit translations, create component, install a dependency in a package, create a new package, technology used, architecture, links where it's deployed, how to use/install for end users, how to update dependencies with `npx taze -l` etc. ## Links diff --git a/data/.env.example b/data/.env.example new file mode 100644 index 0000000..83db903 --- /dev/null +++ b/data/.env.example @@ -0,0 +1,3 @@ +DATABASE_USER=wikipedia_user +DATABASE_PASSWORD=password +DATABASE_NAME=wikipedia diff --git a/data/.eslintrc.json b/data/.eslintrc.json new file mode 100644 index 0000000..42c084e --- /dev/null +++ b/data/.eslintrc.json @@ -0,0 +1,4 @@ +{ + "root": true, + "extends": ["@repo/eslint-config"] +} diff --git a/data/README.md b/data/README.md index 2509542..88d3c29 100644 --- a/data/README.md +++ b/data/README.md @@ -1,17 +1,127 @@ # Wikipedia data -Database layout: +## Utils + +Show the first 10 lines of the SQL file: `head -n 10 ./dump/page.sql` + +To inspect the volume size used by the database: `docker system df -v | grep 'wikipedia-solver-postgres-data'` + +## PostgreSQL related + +### Import SQL file to PostgreSQL Docker Container + +In `compose.yaml`, we can specify SQL scripts to be executed when the container starts for the first time. 
+ +```yaml +volumes: + - "./sql:/docker-entrypoint-initdb.d/" +``` + +### Remove a volume + +```sh +# List all volumes +docker volume ls + +# Remove a volume +docker volume rm data_wikipedia-solver-postgres-data +``` + +## MySQL Related - +MySQL any way to import a huge (32 GB) sql dump faster?: + +Import data.sql MySQL Docker Container: + + ## Dumps Links +- Database layout: +- - -- -- -- -- -- +## `page.sql.gz` + +### MySQL full version + +```sql +-- MariaDB dump 10.19 Distrib 10.5.23-MariaDB, for debian-linux-gnu (x86_64) +-- +-- Host: db1206 Database: enwiki +-- ------------------------------------------------------ +-- Server version 10.6.17-MariaDB-log + +/*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */; +/*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */; +/*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */; +/*!40101 SET NAMES utf8mb4 */; +/*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */; +/*!40103 SET TIME_ZONE='+00:00' */; +/*!40014 SET @OLD_UNIQUE_CHECKS=@@UNIQUE_CHECKS, UNIQUE_CHECKS=0 */; +/*!40014 SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0 */; +/*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='NO_AUTO_VALUE_ON_ZERO' */; +/*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */; + +-- +-- Table structure for table `page` +-- + +DROP TABLE IF EXISTS `page`; +/*!40101 SET @saved_cs_client = @@character_set_client */; +/*!40101 SET character_set_client = utf8 */; +CREATE TABLE `page` ( + `page_id` int(8) unsigned NOT NULL AUTO_INCREMENT, + `page_namespace` int(11) NOT NULL DEFAULT 0, + `page_title` varbinary(255) NOT NULL DEFAULT '', + `page_is_redirect` tinyint(1) unsigned NOT NULL DEFAULT 0, + `page_is_new` tinyint(1) unsigned NOT NULL DEFAULT 0, + `page_random` double unsigned NOT NULL DEFAULT 0, + `page_touched` binary(14) NOT NULL, + `page_links_updated` varbinary(14) DEFAULT NULL, + `page_latest` int(8) unsigned NOT NULL DEFAULT 0, + `page_len` int(8) unsigned NOT NULL DEFAULT 0, + 
`page_content_model` varbinary(32) DEFAULT NULL, + `page_lang` varbinary(35) DEFAULT NULL, + PRIMARY KEY (`page_id`), + UNIQUE KEY `page_name_title` (`page_namespace`,`page_title`), + KEY `page_random` (`page_random`), + KEY `page_len` (`page_len`), + KEY `page_redirect_namespace_len` (`page_is_redirect`,`page_namespace`,`page_len`) +) ENGINE=InnoDB AUTO_INCREMENT=77490241 DEFAULT CHARSET=binary ROW_FORMAT=COMPRESSED; +/*!40101 SET character_set_client = @saved_cs_client */; + +-- +-- Dumping data for table `page` +-- + +/*!40000 ALTER TABLE `page` DISABLE KEYS */; +INSERT INTO `page` VALUES (10,0,'AccessibleComputing',1,0,0.856935107283,'20240722211426','20240722220435',1219062925,111,'wikitext',NULL); + +INSERT INTO `page` VALUES (10,0,'AccessibleComputing',1,0,0.856935107283,'20240722211426','20240722220435',1219062925,111,'wikitext',NULL),(12,0,'Anarchism',0,0,0.786172332974311,'20240731234111','20240731234202',1234495258,110759,'wikitext',NULL),(12281,0,'Gottfried_Wilhelm_Leibniz',0,0,0.79151204115852,'20240731234133','20240731234228',1237687724,155319,'wikitext',NULL),(13,0,'AfghanistanHistory',1,0,0.154661929211,'20240729123940','20240722220436',783865149,90,'wikitext',NULL),(14,0,'AfghanistanGeography',1,0,0.952234464653055,'20240722211426','20240722220436',783865160,92,'wikitext',NULL),(15,0,'AfghanistanPeople',1,0,0.047716566551,'20240722211426','20240722220436',783865293,95,'wikitext',NULL),(12473,1,'Gnosticism',0,0,0.00653186720472934,'20240801075011','20240731232236',1233717868,6579,'wikitext',NULL); +-- Expected output: INSERT INTO `page` VALUES (12,'Anarchism'),(12281,'Gottfried_Wilhelm_Leibniz'); +``` + +### PostgreSQL short version + +```sql +CREATE TABLE IF NOT EXISTS pages ( + id BIGSERIAL PRIMARY KEY, + title VARCHAR(255) UNIQUE NOT NULL + + -- is_redirect BOOLEAN NOT NULL DEFAULT FALSE +); + +-- Examples of inserts +INSERT INTO pages VALUES (10, 'AccessibleComputing'); -- (is_redirect = true) +INSERT INTO pages VALUES (10474, 
'Eight_queens_puzzle'); -- (is_redirect = false) + +INSERT INTO pages VALUES +(10,'AccessibleComputing'), +(12,'Anarchism'), +(13,'AfghanistanHistory'), +(14,'AfghanistanGeography'), +(15,'AfghanistanPeople'); +``` diff --git a/data/adminer/default-orange.css b/data/adminer/default-orange.css new file mode 100644 index 0000000..4b7f66b --- /dev/null +++ b/data/adminer/default-orange.css @@ -0,0 +1,1376 @@ +a, +abbr, +acronym, +address, +applet, +aside, +b, +big, +blockquote, +body, +caption, +center, +cite, +code, +dd, +del, +dfn, +div, +dl, +dt, +em, +fieldset, +font, +footer, +form, +h1, +h2, +h3, +h4, +h5, +h6, +header, +html, +i, +iframe, +img, +ins, +kbd, +label, +legend, +li, +menu, +object, +ol, +p, +pre, +q, +s, +samp, +small, +span, +strike, +strong, +sub, +sup, +table, +tbody, +td, +tfoot, +th, +thead, +tr, +tt, +u, +ul, +var { + margin: 0; + padding: 0; + outline: 0; + border: none; + background: 0 0; + font-size: 10pt; + font-weight: 400; +} + +ol, +ul { + list-style: none; +} + +blockquote, +q { + quotes: none; +} + +blockquote:after, +blockquote:before, +q:after, +q:before { + content: none; +} + +:focus { + outline: 0; +} + +ins { + text-decoration: none; +} + +del { + text-decoration: line-through; +} + +table { + border-collapse: collapse; + border-spacing: 0; +} + +aside, +menu { + display: block; +} + +input[type="submit"], +input[type="checkbox"], +input[type="radio"], +input[type="file"], +label, +select { + cursor: pointer; +} + +input[disabled=""] { + opacity: 0.5; + cursor: not-allowed; + color: #666 !important; + border-color: #aaa !important; +} + +input[type="text"] { + -webkit-user-modify: read-write-plaintext-only; +} + +@font-face { + font-family: entypo; + src: url(../fonts/entypo.eot); + src: + url(../fonts/entypo.eot?#iefix) format("embedded-opentype"), + url(../fonts/entypo.woff) format("woff"), + url(../fonts/entypo.ttf) format("truetype"), + url(../fonts/entypo.svg#entypo) format("svg"); + font-weight: 400; + font-style: normal; 
+} + +html { + overflow-y: scroll; + -webkit-text-size-adjust: none; +} + +body { + font-family: "Helvetica Neue", Helvetica, Verdana, Arial, sans-serif; + background: #fff; + color: #444; +} + +a, +a:visited { + padding: 4px 0; + color: #d55d00; + transition: + color 0.1s ease 0s, + background-color 0.1s ease 0s; +} + +a:link:hover, +a:visited:hover { + color: #aa4a00; + text-decoration: none; +} + +a sup { + padding: 0 5px; +} + +#logins a, +#tables a, +#tables span { + background: inherit; +} + +.active:before { + font-weight: 400; +} + +label { + padding: 3px 10px; +} + +input::-webkit-input-placeholder { + color: #999; +} + +input::-moz-placeholder { + color: #999; +} + +input:-ms-input-placeholder { + color: #999; +} + +input:not([type]), +input[type="text"], +input[type="email"], +input[type="password"], +input[type="search"], +input[type="number"], +pre[contenteditable="true"], +select, +textarea { + padding: 4px 5px !important; + border: 1px solid #ccc !important; + border-radius: 2px; + font-size: 10pt; + background: #fff; + color: #444; + box-shadow: inset 0 2px 2px #ebebeb; + -moz-box-sizing: border-box; + -webkit-box-sizing: border-box; + box-sizing: border-box; +} + +input:not([type]), +input[type="text"], +input[type="email"], +input[type="password"], +input[type="search"], +input[type="number"], +textarea { + -webkit-appearance: none; +} + +input:not([type]), +input[type="text"], +input[type="email"], +input[type="password"], +input[type="search"], +input[type="number"], +select { + height: 28px; +} + +input[type="submit"] { + display: inline-block; + padding: 7px 15px; + border: 1px solid #d55d00; + border-radius: 2px; + background: #d55d00; + color: #fff; + text-align: center; + text-decoration: none; + font-size: 10pt; + transition: background-color 0.1s ease 0s; + -webkit-appearance: none; +} + +input[type="submit"]:hover { + background: #aa4a00; + border-color: #aa4a00; +} + +input[type="submit"][disabled=""]:hover { + background: #fff; +} + 
+input[type="submit"].default { + box-shadow: none; +} + +input[type="image"] { + border: 4px solid #fff; + outline: 1px solid #f2ceb3; + -moz-outline-radius: 2px; + margin-right: 5px; +} + +input[type="image"]:last-child { + margin-right: 0; +} + +input[type="image"]:hover { + border-color: #fbefe6; +} + +input[type="checkbox"], +input[type="radio"] { + margin: 7px 5px 7px 0; +} + +fieldset { + margin: 5px 5px 10px 0; + padding: 5px 10px; + border: 1px solid #ddd; + border-radius: 2px; + background: #f6f6f6; + min-height: 55px; +} + +fieldset input[type="submit"] { + padding: 3px 10px; + border-color: #e69e66; + background: #fff; + color: #d55d00; +} + +fieldset input[type="submit"]:hover { + background: #fbefe6; + color: #d55d00; +} + +fieldset input[type="submit"].default { + border-color: #d55d00; + background: #d55d00; + color: #fff; +} + +fieldset input[type="submit"].default:hover { + background: #aa4a00; + border-color: #aa4a00; +} + +fieldset + table, +table + fieldset { + margin-top: 10px; +} + +fieldset legend a { + position: relative; + padding-bottom: 50px; +} + +fieldset > div > a, +fieldset > div > code, +fieldset > div > div, +fieldset > div > input, +fieldset > div > p, +fieldset > div > select { + position: relative; +} + +legend { + margin-bottom: 3px; +} + +fieldset input, +fieldset select, +p input, +p label, +p select { + margin: 0 5px 5px 0; +} + +.js fieldset > .hidden { + display: block; + margin-top: 5px; + text-align: center; +} + +.js fieldset > .hidden * { + display: none !important; +} + +.js fieldset > .hidden:before { + content: "โถ"; + font-family: entypo, sans-serif; + font-size: 40pt; + line-height: 0; + vertical-align: middle; + color: #e2e2e2; +} + +#fieldset-select.hidden:before { + content: "โš"; +} + +#fieldset-search.hidden:before { + content: "๐Ÿ”"; +} + +#fieldset-sort.hidden:before { + content: "โท"; +} + +#fieldset-export.hidden:before { + content: "๐Ÿ“ค"; +} + +#fieldset-import.hidden:before { + content: "๐Ÿ“ฅ"; +} 
+ +#fieldset-history.hidden:before { + content: "๎ €"; +} + +#fieldset-history br { + display: block; + margin-bottom: 20px; +} + +#fieldset-history.hidden br { + display: none; +} + +#fieldset-partition.hidden:before { + content: "๎œฃ"; +} + +.size { + width: 8ex; +} + +.sqlarea { + max-width: 100%; + width: auto !important; + height: 350px !important; +} + +@media only screen and (max-width: 768px) { + input:not([type]), + input[type="text"], + input[type="email"], + input[type="password"], + input[type="search"], + input[type="number"], + pre[contenteditable="true"], + select, + textarea { + font-size: 12pt; + vertical-align: -1px; + } + + input:not([type]), + input[type="text"], + input[type="email"], + input[type="password"], + input[type="search"], + input[type="number"], + select { + height: 32px; + } + + fieldset input[type="submit"] { + padding: 6px 15px; + } + + .sqlarea { + height: 250px !important; + } +} + +@media only screen and (max-width: 360px) { + input:not([type]), + input[type="text"], + input[type="email"], + input[type="password"], + input[type="search"], + input[type="number"], + pre[contenteditable="true"], + select, + textarea { + width: 100%; + } + + fieldset input[type="submit"], + input[type="submit"] { + padding-left: 10px; + padding-right: 10px; + } +} + +#lang { + position: fixed; + right: 0; + top: 0; + left: auto; + border: none; + padding: 0 0 0 10px; + width: 190px; + height: 40px; + line-height: 30px; + font-size: 0; + z-index: 101; + background: #f6f6f6; +} + +#lang select { + padding: 2px 3px; + margin: 6px 0; + width: 100px; +} + +.logout { + position: fixed; + right: 10px; + margin: 0; + z-index: 101; + overflow: hidden; +} + +.logout input[type="submit"] { + border: none; + margin: 0; + padding: 0 10px; + height: 40px; + background: 0 0; + color: #d55d00; +} + +.logout input[type="submit"]:hover { + background: 0 0; + color: #aa4a00; +} + +@media only screen and (max-width: 768px) { + #lang { + position: static; + left: 0; + 
top: 0; + width: auto; + border-top: 1px solid #ddd; + background: #f6f6f6; + } + + #lang select { + margin: 4px 10px 0 10px; + } + + .logout { + position: relative; + float: right; + margin-top: -40px; + } +} + +@media only screen and (max-width: 360px) { + #lang select { + margin-left: 0; + } +} + +#content { + position: relative; + margin: 0 0 0 261px; + padding: 41px 20px 80px 20px; +} + +#content:before { + position: fixed; + left: 0; + top: 0; + content: ""; + display: block; + width: 100%; + height: 40px; + background: #f6f6f6; + border-bottom: 1px solid #ddd; +} + +#content .links + p { + color: #999; +} + +#breadcrumb { + position: fixed; + left: 261px; + top: 0; + right: 0; + margin: 0; + padding: 0 0 0 20px; + border-right: 205px solid #f6f6f6; + background: #f6f6f6; + height: 40px; + line-height: 40px; + z-index: 100; + overflow: hidden; + text-overflow: ellipsis; +} + +#breadcrumb a { + display: inline-block; + padding: 0; + height: 40px; + line-height: 40px; +} + +h2 { + margin: 20px 0; + font-size: 20pt; + color: #444; +} + +h3 { + margin: 30px 0 10px 0; + font-size: 16pt; +} + +p { + margin: 10px 0; +} + +code { + display: block; + padding: 10px; + margin: 5px 0; + border-left: 7px solid #cde; + border-radius: 2px; + background: #e8f0fa; + overflow: auto; +} + +fieldset code:first-child, +td code, +th code { + display: inline; + margin: 0; + padding: 0; + background: 0 0; + border: none; +} + +pre code { + margin: 0; +} + +fieldset code + i { + display: none; +} + +.time { + margin-left: 10px; + float: right; + font-size: 8pt; + color: #bbb; +} + +.error, +.message { + margin: 20px 0; + padding: 10px; + border-left: 7px solid #cec; +} + +.error { + color: #900; + border-color: #ecc; +} + +.message pre { + margin: 15px 0 5px 0; +} + +.message p { + margin: 0 0 5px 0; +} + +pre + .error, +pre + .message { + margin-top: 0; +} + +#help { + z-index: 200; + border: 1px solid #ddd; + border-radius: 2px; + background: #f6f6f6; + padding: 5px 7px; +} + 
+.icon { + background-color: #d55d00; +} + +.icon:hover { + background-color: #aa4a00; +} + +@media only screen and (max-width: 768px) { + #content { + margin-left: 0; + } + + #breadcrumb { + left: 0; + padding-left: 50px; + border-right-width: 0; + } +} + +@media only screen and (max-width: 360px) { + #content { + padding: 41px 10px 20px; + } + + h2 { + margin: 15px 0; + font-size: 16pt; + } +} + +h1 { + height: 40px; + white-space: nowrap; + overflow: hidden; +} + +h1 #h1 { + display: inline-block; + padding: 0; + background: url(logo.png) 10px center no-repeat; + background-size: 120px; + text-indent: -100px; + width: 135px; + height: 40px; +} + +#version, +.version { + position: relative; + top: -7px; + vertical-align: bottom; + font-size: 8pt; + font-style: italic; + color: #bbb; +} + +#version { + padding: 5px; + color: #d55d00; +} + +#version:hover { + color: #aa4a00; +} + +#menu { + position: fixed; + left: 0; + top: 0; + bottom: 0; + width: 260px; + margin: 0; + padding: 0; + border-right: 1px solid #ddd; + overflow-y: auto; + overflow-x: hidden; + -webkit-overflow-scrolling: touch; + background: #fff; + z-index: 100; +} + +#menu #dbs { + border-top: 1px solid #ddd; + border-bottom: none; + padding: 10px; + background: #f6f6f6; + color: #f6f6f6; + font-size: 0; +} + +#menu #dbs span { + font-size: 0; +} + +#menu #dbs select { + margin: 0; + width: 100%; +} + +#menu .links { + background: #f6f6f6; + border-bottom: 1px solid #ddd; + padding: 0 10px 7px 10px; +} + +#menu #logins, +#menu #tables { + border-bottom: 1px solid #ddd; + padding: 0; + margin-bottom: 25px; +} + +#menu .message { + background: 0 0; + border: none; + border-bottom: 1px solid #ddd; + color: #bbb; +} + +.menu-link { + display: block; + padding: 2px 10px; + width: auto; + height: 20px; + line-height: 20px; + color: #444; + overflow: hidden; + text-overflow: ellipsis; +} + +.menu-link.active { + color: #d55d00; + font-weight: 700; +} + +#tables a[href*="&table="] { + display: block; + 
padding: 2px 10px; + width: auto; + height: 20px; + line-height: 20px; + color: #444; + overflow: hidden; + text-overflow: ellipsis; + padding-right: 0; +} + +#tables a[href*="&table="].active { + color: #d55d00; + font-weight: 700; +} + +#tables a[href*="&table="]:hover { + background: #eee; +} + +#tables a[href*="&select="] { + float: right; + display: block; + padding: 2px 7px; + height: 20px; + line-height: 20px; + color: #999 !important; + overflow: hidden; + width: 16px; +} + +#tables a[href*="&select="]:hover { + color: #444 !important; +} + +#tables a[href*="&select="]:before { + content: "๐Ÿ“„ย ย "; + font-family: entypo, sans-serif; + font-size: 24pt; + line-height: 0; + vertical-align: -3px; +} + +#tables li:first-of-type a { + padding-top: 7px; +} + +#tables li:last-child a { + padding-bottom: 7px; +} + +#tables a.active + a { + color: #d55d00; + font-weight: 700; +} + +#tables br { + display: none; +} + +#tables.simple a { + display: block; + padding: 2px 10px; + width: auto; + height: 20px; + line-height: 20px; + color: #444; + overflow: hidden; + text-overflow: ellipsis; +} + +#tables.simple a.active { + color: #d55d00; + font-weight: 700; +} + +#tables.simple a[href*="&select="] { + display: block; + padding: 2px 10px; + width: auto; + height: 20px; + line-height: 20px; + color: #444; + overflow: hidden; + text-overflow: ellipsis; + float: none; + color: #444 !important; +} + +#tables.simple a[href*="&select="].active { + color: #d55d00; + font-weight: 700; +} + +#tables.simple a[href*="&select="].active { + color: #d55d00 !important; +} + +#tables.simple a[href*="&select="]:before { + content: ""; +} + +#tables.simple li:hover { + background: #eee; +} + +#tables.simple li:first-child a { + padding-top: 7px; +} + +#tables.simple li:last-child a { + padding-bottom: 7px; +} + +#logins { + border-top: 1px solid #ddd; +} + +#logins a { + display: block; + padding: 2px 10px; + width: auto; + height: 20px; + line-height: 20px; + color: #444; + overflow: 
hidden; + text-overflow: ellipsis; +} + +#logins a.active { + color: #d55d00; + font-weight: 700; +} + +#logins a:hover { + background: #eee; +} + +#logins a:first-of-type { + padding-top: 7px; +} + +#logins a:last-of-type { + padding-bottom: 7px; +} + +#logins br { + display: none; +} + +@media only screen and (max-width: 768px) { + h1:before { + float: left; + position: relative; + left: 4px; + top: 4px; + width: 30px; + height: 30px; + content: "โ˜ฐ"; + font-family: entypo, sans-serif; + font-size: 32pt; + line-height: 30px; + border: 1px solid #e69e66; + border-radius: 2px; + text-align: center; + vertical-align: middle; + background: #fff; + cursor: pointer; + } + + h1 #h1 { + margin-left: 10px; + } + + #menu { + width: 40px; + height: 40px; + bottom: auto; + border: none; + overflow: hidden; + background: 0 0; + } + + #menu form, + #menu p { + display: none; + } + + #menu.open { + width: 260px; + height: auto; + max-width: 100%; + max-height: 100%; + border-right: 1px solid #ddd; + border-bottom: 5px solid #ddd; + background: #fff; + box-shadow: 2px 2px 10px rgba(0, 0, 0, 0.03); + z-index: 200; + overflow-y: auto; + } + + #menu.open form, + #menu.open p { + display: block; + } +} + +@media only screen and (max-width: 270px) { + #menu.open { + border-right: none; + } +} + +@media only screen and (-webkit-min-device-pixel-ratio: 1.5), + only screen and (min--moz-device-pixel-ratio: 1.5), + only screen and (-o-min-device-pixel-ratio: 3/2), + only screen and (min-device-pixel-ratio: 1.5) { + h1 #h1 { + background-image: url(../images/logo-hres.png?3); + background-size: 120px; + } +} + +a[href*="&sql="]:before { + content: "โœŽ"; + padding: 0 5px; + font-family: entypo, sans-serif; + font-size: 24pt; + line-height: 10pt; + vertical-align: -3px; +} + +.links { + line-height: 22px; +} + +.links a:before { + content: "โดย "; + font-family: entypo, sans-serif; + font-size: 24pt; + line-height: 10pt; + vertical-align: -3px; +} + +.links a[href*="&sql="]:before { + 
content: "๎œท"; + margin-left: -4px; + margin-right: 3px; +} + +.links a[href*="&import="]:before { + content: "๐Ÿ“ฅย "; +} + +.links a[href*="&dump="]:before { + content: "๐Ÿ“คย "; +} + +.links a[href*="&create="]:before, +.links a[href*="&db="][href*="&database="]:before, +.links a[href*="&indexes="]:before { + content: "โœŽย "; +} + +.links a[href$="&create="]:before, +.links a[href$="&database="]:before, +.links a[href$="&indexes="]:before { + content: "โž•ย "; +} + +.links a[href*="&schema="]:before { + content: "๐Ÿ•ชย "; +} + +.links a[href*="&privileges="]:before { + content: "๐Ÿ‘ฅย "; +} + +.links a[href*="&view="]:before { + content: "๎œŠย "; +} + +.links a[href*="&procedure="]:before, +.links a[href*="&function="]:before { + content: "๎€ƒย "; +} + +.links a[href*="&event="]:before { + content: "๐Ÿ”ย "; +} + +.links a[href*="&edit="]:before { + content: "โŠ•ย "; +} + +.links a[href*="&table="]:before { + content: "โš™ย "; +} + +.links a[href*="&select="]:before { + content: "๐Ÿ“„ย "; +} + +.links a[href*="&processlist="]:before { + content: "๎€…ย "; +} + +.links a[href*="&status="]:before { + content: "๐Ÿ“ฟย "; +} + +.links a[href*="&variables="]:before { + content: "๎œ”ย "; +} + +.links a[href*="&user="]:before { + content: "๎œ€ย "; +} + +.links a[href*="&foreign="]:before, +.links a[href*="&trigger="]:before { + content: "โž•ย "; +} + +table { + border: 1px solid #ddd; + margin: 20px 0 10px 0; +} + +table label.block { + padding: 0; +} + +tr { + border-bottom: 1px dotted #ddd; +} + +td, +th { + padding: 4px 10px; +} + +td[align="right"] input[type="checkbox"], +td[align="right"] input[type="radio"], +th[style="text-align: right;"] input[type="checkbox"], +th[style="text-align: right;"] input[type="radio"] { + margin-right: 0; + margin-left: 5px; +} + +.js .checkable .checked td, +.js .checkable .checked th, +.odd td, +tbody tr:hover td, +tbody tr:hover th, +thead td, +thead th { + background: 0 0; +} + +thead tr { + background: #f6f6f6; + border-bottom: 
1px solid #ddd; +} + +thead td, +thead th { + padding: 7px 10px; + background: 0 0; + text-align: left; +} + +tbody td, +tbody th { + vertical-align: top; +} + +tbody td[align="right"] { + text-align: right; +} + +tbody td[align="right"] label.block { + text-align: right; +} + +tbody th span { + padding-top: 4px; +} + +table.checkable .checked { + background: #fbefe6; +} + +table.checkable input[type="checkbox"], +table.checkable input[type="radio"] { + margin: 2px 5px 2px 0; +} + +table.checkable > thead a { + padding: 7px 0; +} + +table.checkable > thead input[type="checkbox"], +table.checkable > thead input[type="radio"] { + margin: 2px 5px 2px 0; +} + +table.checkable > tbody > tr:hover { + background: #f5f5f5; +} + +table.checkable > tbody > tr.checked:hover { + background: #f8e4d4; +} + +.footer { + position: relative; + padding: 0; +} + +.footer > p { + position: fixed; + left: 261px; + right: 0; + bottom: 0; + margin: 0; + padding: 0 10px; + border: none; + border-top: 1px solid #ddd; + background: #f6f6f6; + z-index: 102; + font-weight: 700; +} + +.footer > p a, +.footer > p label { + display: inline-block; + margin: 0; + padding: 0 10px; + height: 40px; + line-height: 40px; +} + +.js .column { + background: #fff; + padding: 0; + margin: -36px 0 0 -62px; + border: 1px solid #e49254; + border-radius: 2px; + z-index: 10; +} + +.js .column a { + display: inline-block; + padding: 0; + width: 30px; + height: 30px; + overflow: hidden; + vertical-align: middle; +} + +.js .column a:before { + display: inline-block; + width: 30px; + height: 30px; + line-height: 30px; + font-family: entypo, sans-serif; + font-size: 24pt; + text-align: center; + vertical-align: -3px; +} + +.js .column a:hover:before { + background: #fbefe6; +} + +.js .column a[href*="&select="]:before { + content: "โฌ‡"; +} + +.js .column a[href="#fieldset-search"]:before { + content: "๐Ÿ”"; +} + +@media only screen and (max-width: 768px) { + .footer > p { + position: static; + margin: -10px 0 10px 
0; + border-top: none; + border-left: 1px solid #ddd; + border-right: 1px solid #ddd; + border-bottom: 1px solid #ddd; + } +} + +a.jush-custom:hover, +a.jush-help:hover { + color: inherit; + text-decoration: underline; +} + +.json { + border-color: #cde; + border-left: 7px solid #cde; + background: #e8f0fa; + margin: 5px 0 3px 0; +} + +.json tr { + border-bottom: 1px solid #cde; +} + +.json tr:last-child { + border-bottom: none; +} + +.json th { + border-right: 1px solid #cde; + vertical-align: top; +} + +.json code { + padding: 4px 10px; +} + +.json + textarea { + margin-top: 6px; +} + +a.json-icon { + background: 0 0; + text-indent: 0; +} + +a.json-icon:hover { + background: 0 0; +} + +a.json-icon:before { + display: inline-block; + width: 20px; + height: 18px; + line-height: 18px; + font-family: entypo, sans-serif; + font-size: 24pt; + vertical-align: -3px; + content: "โ–ธ"; +} + +a.json-icon.json-up { + background: 0 0; + text-indent: 0; +} + +a.json-icon.json-up:before { + content: "โ–พ"; +} + +a.json-link { + padding-left: 0; +} + +a.json-link:before { + width: 10px; +} + +a.json-link span { + color: inherit; +} diff --git a/data/adminer/fonts/entypo.eot b/data/adminer/fonts/entypo.eot new file mode 100644 index 0000000..1940b6d Binary files /dev/null and b/data/adminer/fonts/entypo.eot differ diff --git a/data/adminer/fonts/entypo.svg b/data/adminer/fonts/entypo.svg new file mode 100644 index 0000000..a02386a --- /dev/null +++ b/data/adminer/fonts/entypo.svg @@ -0,0 +1,264 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/data/adminer/fonts/entypo.ttf b/data/adminer/fonts/entypo.ttf new file mode 100644 index 0000000..539ff2b Binary files /dev/null and b/data/adminer/fonts/entypo.ttf differ diff --git a/data/adminer/fonts/entypo.woff b/data/adminer/fonts/entypo.woff new file mode 100644 index 0000000..c9b908c Binary files /dev/null and b/data/adminer/fonts/entypo.woff differ diff --git a/data/adminer/logo.png b/data/adminer/logo.png new file mode 100644 index 0000000..9956bba Binary files /dev/null and b/data/adminer/logo.png differ diff --git a/data/compose.yaml b/data/compose.yaml new file mode 100644 index 0000000..8e728b0 --- /dev/null +++ b/data/compose.yaml @@ -0,0 +1,71 @@ +services: + # wikipedia-solver-database: + # container_name: "wikipedia-solver-database" + # image: "postgres:16.3" + # restart: "unless-stopped" + # env_file: ".env" + # environment: + # POSTGRES_USER: ${DATABASE_USER} + # POSTGRES_PASSWORD: ${DATABASE_PASSWORD} + # POSTGRES_DB: ${DATABASE_NAME} + # volumes: + # - "wikipedia-solver-postgres-data:/var/lib/postgresql/data" + # - "./sql:/docker-entrypoint-initdb.d/" + + wikipedia-solver-database: + container_name: "wikipedia-solver-database" + image: "mariadb:10.6.17" + restart: "unless-stopped" + env_file: ".env" + environment: + MARIADB_USER: ${DATABASE_USER} + MARIADB_PASSWORD: ${DATABASE_PASSWORD} + MARIADB_ROOT_PASSWORD: ${DATABASE_PASSWORD} + MARIADB_DATABASE: ${DATABASE_NAME} + command: + --innodb_buffer_pool_size=4G + --innodb_log_buffer_size=256M + --innodb_log_file_size=1G + --innodb_write_io_threads=16 + --innodb_flush_log_at_trx_commit=0 + --max_allowed_packet=1G + volumes: + - "wikipedia-solver-mariadb-data:/var/lib/mysql" + - "./sql:/docker-entrypoint-initdb.d/" + + adminer: + container_name: "adminer" + image: "adminer:4.8.1" + restart: "unless-stopped" + ports: + - "8080:8080" + env_file: ".env" + environment: + ADMINER_DEFAULT_SERVER: "wikipedia-solver-database" + 
volumes: + - "./adminer/default-orange.css:/var/www/html/adminer.css" + - "./adminer/logo.png:/var/www/html/logo.png" + - "./adminer/fonts/:/var/www/html/fonts" + + # dbgate: + # image: "dbgate/dbgate:5.3.3" + # restart: "always" + # ports: + # - "8080:3000" + # volumes: + # - "dbgate-data:/root/.dbgate" + # environment: + # CONNECTIONS: "con1" + + # LABEL_con1: "Postgres" + # SERVER_con1: "wikipedia-solver-database" + # USER_con1: ${DATABASE_USER} + # PASSWORD_con1: ${DATABASE_PASSWORD} + # PORT_con1: 5432 + # ENGINE_con1: "postgres@dbgate-plugin-postgres" + +volumes: + wikipedia-solver-mariadb-data: + # wikipedia-solver-postgres-data: + # dbgate-data: + # driver: "local" diff --git a/data/database-wikipedia-v2.js b/data/database-wikipedia-v2.js new file mode 100644 index 0000000..bcb63a2 --- /dev/null +++ b/data/database-wikipedia-v2.js @@ -0,0 +1,113 @@ +import fs from "node:fs" +import path from "node:path" +import { extractRowsFromSQLValues } from "./utils.js" + +const SQL_DUMP_PATH = path.join(process.cwd(), "dump") +const SQL_OUTPUT_PATH = path.join(process.cwd(), "sql") + +/** + * @typedef {Record<string, number>} WikipediaPages + * + * Object to store pages from Wikipedia: + * - Key: page title sanitized - The real title shown is this title with underscores (_) converted to spaces ( ). + * - Value: page id. + */ + +/** + * Function to clean the `page.sql` file by: + * - Removing all lines that don't start with `INSERT INTO...`. + * - Filter by keeping rows where `page_namespace` (2nd column) is equal to 0, and where `page_is_redirect` (4th column) is equal to false (0). + * - Only keep columns `page_id` (1st column) and `page_title` (3rd column). 
+ * @returns {Promise} + */ +const cleanPagesSQL = async () => { + /** @type {WikipediaPages} */ + const wikipediaPages = {} + + const INSERT_INTO_START_INPUT = "INSERT INTO `page` VALUES " + const sqlInputPath = path.join(SQL_DUMP_PATH, "page.sql") + const sqlInputStat = await fs.promises.stat(sqlInputPath) + const sqlInputFileStream = fs.createReadStream(sqlInputPath, "utf-8") + + let isInsideInsert = false + let current = "" + let lastPercent = 0 + + return await new Promise((resolve, reject) => { + sqlInputFileStream + .on("data", (dataInput) => { + const bytesReadRatio = sqlInputFileStream.bytesRead / sqlInputStat.size + const bytesReadPercent = bytesReadRatio * 100 + + if (bytesReadPercent - lastPercent >= 1) { + console.log( + `Bytes read (${bytesReadPercent.toFixed(2)}%): ${sqlInputFileStream.bytesRead} / ${sqlInputStat.size}`, + ) + lastPercent = bytesReadPercent + } + + let data = current + dataInput + + if (!isInsideInsert) { + const lines = data.split("\n").filter((line) => { + return line.startsWith(INSERT_INTO_START_INPUT) + }) + const [line] = lines + if (line == null) { + sqlInputFileStream.close() + return reject(new Error(`No "${INSERT_INTO_START_INPUT}" found.`)) + } + isInsideInsert = true + const lineStripped = line.slice(INSERT_INTO_START_INPUT.length) + data = lineStripped + } + + const { rows, unCompleted } = extractRowsFromSQLValues(data) + current = unCompleted + + for (const row of rows) { + if (row.length !== 12) { + sqlInputFileStream.close() + console.error([row]) + return reject(new Error(`Invalid Row values.`)) + } + + const id = Number.parseInt(row[0] ?? "0", 10) + const namespace = row[1] ?? "" + const title = row[2] ?? 
"" + const isRedirect = row[3] === "1" + + if (namespace === "0" && !isRedirect) { + wikipediaPages[title] = id + } + } + }) + .on("error", (error) => { + return reject(error) + }) + .on("close", () => { + return resolve(wikipediaPages) + }) + }) +} + +const wikipediaPages = await cleanPagesSQL() + +const sqlOutputPath = path.join(SQL_OUTPUT_PATH, "2-pages-inserts.sql") +const INSERT_INTO_START_OUTPUT = "INSERT INTO pages VALUES " + +const wikipediaPagesString = Object.entries(wikipediaPages) + .map(([title, id]) => { + return `(${id},${title})` + }) + .join(",") + +await fs.promises.writeFile( + sqlOutputPath, + `${INSERT_INTO_START_OUTPUT}${wikipediaPagesString};`, + { encoding: "utf-8" }, +) + +// const sqlOutputFile = await fs.promises.open(sqlOutputPath, "w") +// await sqlOutputFile.appendFile(INSERT_INTO_START_OUTPUT) +// await sqlOutputFile.appendFile(`(${id},${title}),\n`, "utf-8") diff --git a/data/database-wikipedia.js b/data/database-wikipedia.js new file mode 100644 index 0000000..a41fa5c --- /dev/null +++ b/data/database-wikipedia.js @@ -0,0 +1,94 @@ +import fs from "node:fs" +import path from "node:path" +import { extractRowsFromSQLValues } from "./utils.js" + +const SQL_DUMP_PATH = path.join(process.cwd(), "dump") +const SQL_OUTPUT_PATH = path.join(process.cwd(), "sql") + +/** + * Function to clean the `page.sql` file by: + * - Removing all lines that don't start with `INSERT INTO...`. + * - Filter by keeping rows where `page_namespace` (2nd column) is equal to 0, and where `page_is_redirect` (4th column) is equal to false (0). + * - Only keep columns `page_id` (1st column) and `page_title` (3rd column). 
+ */ +const cleanPagesSQL = async () => { + const INSERT_INTO_START_INPUT = "INSERT INTO `page` VALUES " + const INSERT_INTO_START_OUTPUT = "INSERT INTO pages VALUES\n" + const sqlInputPath = path.join(SQL_DUMP_PATH, "page.sql") + const sqlOutputPath = path.join(SQL_OUTPUT_PATH, "2-pages-inserts.sql") + + const sqlInputStat = await fs.promises.stat(sqlInputPath) + const sqlInputFileStream = fs.createReadStream(sqlInputPath, "utf-8") + const sqlOutputFile = await fs.promises.open(sqlOutputPath, "w") + await sqlOutputFile.appendFile(INSERT_INTO_START_OUTPUT) + + let isInsideInsert = false + let current = "" + let lastPercent = 0 + + return await new Promise((resolve, reject) => { + sqlInputFileStream + .on("data", async (dataInput) => { + const bytesReadRatio = sqlInputFileStream.bytesRead / sqlInputStat.size + const bytesReadPercent = bytesReadRatio * 100 + + if (bytesReadPercent - lastPercent >= 1) { + console.log( + `Bytes read (${bytesReadPercent.toFixed(2)}%): ${sqlInputFileStream.bytesRead} / ${sqlInputStat.size}`, + ) + lastPercent = bytesReadPercent + } + + /** + * @type {string} + */ + let data = current + dataInput + + if (!isInsideInsert) { + const lines = data.split("\n").filter((line) => { + return line.startsWith(INSERT_INTO_START_INPUT) + }) + const [line] = lines + if (line == null) { + sqlInputFileStream.close() + return reject(new Error(`No "${INSERT_INTO_START_INPUT}" found.`)) + } + isInsideInsert = true + const lineStripped = line.slice(INSERT_INTO_START_INPUT.length) + data = lineStripped + } + + const { rows, unCompleted } = extractRowsFromSQLValues(data) + current = unCompleted + + for (const row of rows) { + if (row.length !== 12) { + sqlInputFileStream.close() + console.error([row]) + return reject(new Error(`Invalid Row values.`)) + } + + const id = Number.parseInt(row[0] ?? "0", 10) + const namespace = row[1] ?? "" + const title = row[2] ?? 
"" + const isRedirect = row[3] === "1" + + if (namespace === "0" && !isRedirect) { + await sqlOutputFile.appendFile(`(${id},${title}),\n`, "utf-8") + } + } + }) + .on("error", async (error) => { + await sqlOutputFile.close() + return reject(error) + }) + .on("close", async () => { + console.log(`Cleaned "${sqlInputPath}" to "${sqlOutputPath}".`) + await sqlOutputFile.appendFile(";\n", "utf-8") + await sqlOutputFile.close() + return resolve() + }) + }) +} + +await cleanPagesSQL() diff --git a/data/database-wikipedia.sh b/data/database-wikipedia.sh new file mode 100755 index 0000000..d89c02d --- /dev/null +++ b/data/database-wikipedia.sh @@ -0,0 +1,79 @@ +#!/usr/bin/env bash + +# Usage: ./database-wikipedia.sh +# Description: Download and extract Wikipedia database dumps. + +set -o errexit +set -o nounset +set -o pipefail + +DUMP_DIRECTORY="dump" +SQL_OUTPUT_DIRECTORY="sql" +DOWNLOAD_DATE="latest" +WIKIPEDIA_DUMP_URL="https://dumps.wikimedia.org/enwiki/${DOWNLOAD_DATE}/enwiki-${DOWNLOAD_DATE}-" + +mkdir --parents "${DUMP_DIRECTORY}" + +download_file() { + local filename="${1}" + local file_path_output="${DUMP_DIRECTORY}/${filename}" + local file_url="${WIKIPEDIA_DUMP_URL}${filename}" + + if [[ ! -f "${file_path_output}" ]]; then + echo "Downloading \"${filename}\" from \"${file_url}\"..." + wget --output-document="${file_path_output}" "${file_url}" + else + echo "File \"${filename}\" from \"${file_url}\" already exists." + fi +} + +# download_file "page.sql.gz" +# download_file "pagelinks.sql.gz" + +extract_file() { + local filename="${1}" + local file_path_input="${DUMP_DIRECTORY}/${filename}" + local file_path_output="${DUMP_DIRECTORY}/${filename%.gz}" + + if [[ ! -f "${file_path_output}" ]]; then + echo "Extracting \"${filename}\" to \"${file_path_output}\"..." + gzip --decompress "${file_path_input}" + + # `--keep` flag to keep the original file, not needed here. 
+ # gzip --decompress --keep "${file_path_input}" + else + echo "File \"${filename}\" already extracted." + fi +} + +# extract_file "page.sql.gz" +# extract_file "pagelinks.sql.gz" + +# Function to clean the `page.sql` file by: +# - Removing all lines that don't start with `INSERT INTO...`. +# - Filter by keeping rows where `page_namespace` (2nd column) is equal to 0, and where `page_is_redirect` (4th column) is equal to 0. +# - Only keep columns `page_id` (1st column) and `page_title` (3rd column). +# - Replace 'INSERT INTO `page` VALUES' with 'INSERT INTO pages VALUES'. +# - Replace escape single quote `\'` in MySQL to the PostgreSQL version `''`. +# - Replace escape double quote `\"` in MySQL to the PostgreSQL version `"`. +# - Handle backslashes `\\` by replacing them with a single backslash `\` for PostgreSQL. +clean_pages_sql() { + local sql_input_file_directory="${1}" + local sql_input="${sql_input_file_directory}/page.sql" + local sql_output="${SQL_OUTPUT_DIRECTORY}/2-pages-inserts.sql" + + sed --quiet '/^INSERT INTO `page` VALUES (/p' "${sql_input}" | + grep -oP "INSERT INTO \`page\` VALUES \(.+?\);" | + sed 's/),(/)\n(/g' | + grep -P "\([0-9]+,0,'.*?',0" | + sed -E "s/^\(([0-9]+),0,'([^']*)',0.*\)$/\1,'\2'/" | + sed "s/\\\'/''/g" | # Replace escaped single quotes + sed 's/\\"/"/g' | # Replace escaped double quotes + sed 's/\\\\/\\/g' | # Replace double backslashes with a single backslash + awk 'BEGIN {print "INSERT INTO pages VALUES"} {print "(" $0 "),"}' | + sed '$ s/,$/;/g' >"$sql_output" + + echo "Cleaned \"${sql_input}\" to \"${sql_output}\"." 
+} + +# clean_pages_sql "${DUMP_DIRECTORY}" diff --git a/data/sql/0-insert-optimizer-start.sql b/data/sql/0-insert-optimizer-start.sql new file mode 100644 index 0000000..896385a --- /dev/null +++ b/data/sql/0-insert-optimizer-start.sql @@ -0,0 +1,4 @@ +SET AUTOCOMMIT = 0; +SET FOREIGN_KEY_CHECKS = 0; +SET UNIQUE_CHECKS = 0; +BEGIN; diff --git a/data/sql/1-pages-create.sql b/data/sql/1-pages-create.sql new file mode 100644 index 0000000..0bedf96 --- /dev/null +++ b/data/sql/1-pages-create.sql @@ -0,0 +1,9 @@ +CREATE TABLE `pages` ( + `id` INT(8) UNSIGNED NOT NULL AUTO_INCREMENT, + `title` VARBINARY(255) NOT NULL DEFAULT '', + PRIMARY KEY (`id`), + UNIQUE KEY (`title`) +) ENGINE=InnoDB AUTO_INCREMENT=77490241 DEFAULT CHARSET=binary ROW_FORMAT=COMPRESSED; + +-- VARBINARY usage instead of VARCHAR explanation: +-- > War on varchar. Changed all occurrences of varchar(N) and varchar(N) binary to varbinary(N). varchars cause problems ("Invalid mix of collations" errors) on MySQL databases with certain configs, most notably the default MySQL config. 
diff --git a/data/sql/99-insert-optimizer-end.sql b/data/sql/99-insert-optimizer-end.sql new file mode 100644 index 0000000..e724937 --- /dev/null +++ b/data/sql/99-insert-optimizer-end.sql @@ -0,0 +1,4 @@ +COMMIT; +SET AUTOCOMMIT = 1; +SET FOREIGN_KEY_CHECKS = 1; +SET UNIQUE_CHECKS = 1; diff --git a/data/test.js b/data/test.js new file mode 100644 index 0000000..f9e8715 --- /dev/null +++ b/data/test.js @@ -0,0 +1,48 @@ +import { extractRowsFromSQLValues } from "./utils.js" + +console.log( + "output:", + extractRowsFromSQLValues("(1,'-)',0),(2,'Demographics_of_American_Samoa',0)"), +) + +console.log( + "output:", + extractRowsFromSQLValues( + `(1,'-d\\'ff)',0),(2,'Demographics_of_American_Samoa',0)`, + ), +) + +console.log( + "output:", + extractRowsFromSQLValues( + "(1,'-)',0),(2,'Demographics_of_American_Samoa',0),(11,'abc',ddf,123,43,'dff'", + ), +) + +console.log( + "output:", + extractRowsFromSQLValues( + "(1,'-)',0),(2,'Demographics_of_American_Samoa',0),(11", + ), +) + +console.log( + "output:", + extractRowsFromSQLValues( + "(1,'-)',0),(2,'Demographics_of_American_Samoa',0),(", + ), +) + +console.log( + "output:", + extractRowsFromSQLValues( + `(1,'-)',0),(2,'C๏ผš\\\\',1,0),(2,'Demographics_of_American_Samoa',0)`, + ), +) + +console.log( + "output:", + extractRowsFromSQLValues( + `(1,'-)',0),(2,'Good_Singin\\',_Good_Playin\\'',1,0),(2,'Demographics_of_American_Samoa',0)`, + ), +) diff --git a/data/utils.js b/data/utils.js new file mode 100644 index 0000000..48f77bf --- /dev/null +++ b/data/utils.js @@ -0,0 +1,63 @@ +/** + * Extracts rows from a string of values in a SQL INSERT INTO statement, where each row is a comma-separated list of values enclosed in parentheses, possibly with the last row incomplete. 
/**
 * Splits the `VALUES` part of a SQL `INSERT INTO` statement into rows.
 *
 * Every row is a parenthesised, comma-separated list of values. Values are
 * returned verbatim as strings (quoted values keep their surrounding single
 * quotes and their backslash escapes). Commas and parentheses that appear
 * inside a single-quoted value do not split anything. Characters located
 * between rows (separators, statement prefixes, ...) are ignored.
 *
 * The input may stop in the middle of a row (for example when a file is read
 * chunk by chunk): the text of that unfinished row, starting at its opening
 * parenthesis, is returned in `unCompleted` so the caller can prepend it to
 * the next chunk.
 *
 * @param {string} input
 * @returns {{rows: string[][], unCompleted:string}}
 * @example extractRowsFromSQLValues("(1,'-)',0),(2,'Demographics_of_American_Samoa',0)") // { rows: [["1","'-)'","0"],["2","'Demographics_of_American_Samoa'","0"]], unCompleted: "" }
 */
export const extractRowsFromSQLValues = (input) => {
  /**
   * Tells whether the character at `position` is escaped, i.e. directly
   * preceded by an odd number of backslashes.
   * @param {number} position
   * @returns {boolean}
   */
  const isEscapedAt = (position) => {
    let backslashes = 0
    for (
      let lookBehind = position - 1;
      lookBehind >= 0 && input[lookBehind] === "\\";
      lookBehind--
    ) {
      backslashes++
    }
    return backslashes % 2 === 1
  }

  const rows = []
  let cursor = 0

  while (cursor < input.length) {
    // Anything that is not the start of a row is skipped.
    if (input[cursor] !== "(") {
      cursor++
      continue
    }

    const rowStart = cursor
    const fields = []
    let fieldStart = rowStart + 1
    let quoted = false
    let closed = false

    for (cursor = fieldStart; cursor < input.length && !closed; cursor++) {
      const character = input[cursor]

      if (character === "'") {
        // Only an unescaped quote opens or closes a quoted value.
        if (!isEscapedAt(cursor)) {
          quoted = !quoted
        }
        continue
      }

      // Separators inside a quoted value are plain content.
      if (quoted) {
        continue
      }

      if (character === ",") {
        fields.push(input.slice(fieldStart, cursor))
        fieldStart = cursor + 1
      } else if (character === ")") {
        fields.push(input.slice(fieldStart, cursor))
        rows.push(fields)
        closed = true
      }
    }

    if (!closed) {
      // Input ended inside this row: hand back its raw text, "(" included.
      return { rows, unCompleted: input.slice(rowStart) }
    }
  }

  return { rows, unCompleted: "" }
}