feat(api): create Lucid models and migrations for Wikipedia database dump + usage of PostgreSQL instead of MariaDB

This commit is contained in:
Théo LUDWIG 2024-08-11 09:42:42 +01:00
parent aa2fb4f5b9
commit 02ee112de4
Signed by: theoludwig
GPG Key ID: ADFE5A563D718F3B
31 changed files with 400 additions and 535 deletions

10
.gitignore vendored
View File

@ -21,11 +21,13 @@ build/
*.pem
.turbo
tmp/
cache.json
# data
data/dump
data/sql/*
!data/sql/0000-tables-create.sql
!data/sql/0999-constraints.sql
data/sql-pages-inserts/*
!data/sql-pages-inserts/0000-pages.sh
data/sql-internal-links-inserts/*
!data/sql-internal-links-inserts/0000-internal-links.sh
# debug
npm-debug.log*

View File

@ -30,7 +30,7 @@
- [ ] Handle redirects
- [ ] Implement REST API (`api`) with JSON responses ([AdonisJS](https://adonisjs.com/)) to get shortest paths between 2 pages
- [x] Init AdonisJS project
- [ ] Create Lucid models and migrations for Wikipedia Database Dump: `pages` and `internal_links` tables
- [x] Create Lucid models and migrations for Wikipedia Database Dump: `pages` and `internal_links` tables
- [ ] Implement `GET /wikipedia/pages?title=Node.js` to search a page by title (not necessarily with the title sanitized, search with input by user to check if page exists)
- [ ] Implement `GET /wikipedia/pages/internal-links/paths?from=Node.js&to=Linux` to get all the possible paths between 2 pages with titles sanitized
- [ ] Implement Wikipedia Game Solver (`website`)

View File

@ -23,7 +23,7 @@
"@repo/wikipedia-game-solver": "workspace:*",
"@vinejs/vine": "catalog:",
"luxon": "catalog:",
"mysql2": "catalog:",
"pg": "catalog:",
"reflect-metadata": "catalog:",
"tsx": "catalog:",
"pino-pretty": "catalog:"

View File

@ -13,9 +13,9 @@ export default class HttpExceptionHandler extends ExceptionHandler {
*/
public override async handle(
error: unknown,
ctx: HttpContext,
context: HttpContext,
): Promise<unknown> {
return await super.handle(error, ctx)
return await super.handle(error, context)
}
/**
@ -25,8 +25,8 @@ export default class HttpExceptionHandler extends ExceptionHandler {
*/
public override async report(
error: unknown,
ctx: HttpContext,
context: HttpContext,
): Promise<void> {
return await super.report(error, ctx)
return await super.report(error, context)
}
}

View File

@ -13,13 +13,13 @@ export default class AuthMiddleware {
redirectTo = "/login"
public async handle(
ctx: HttpContext,
context: HttpContext,
next: NextFn,
options: {
guards?: Array<keyof Authenticators>
} = {},
): Promise<void> {
await ctx.auth.authenticateUsing(options.guards, {
await context.auth.authenticateUsing(options.guards, {
loginRoute: this.redirectTo,
})
return next()

View File

@ -5,13 +5,13 @@ import type { NextFn } from "@adonisjs/core/types/http"
/**
* The container bindings middleware binds classes to their request specific value using the container resolver.
*
* - We bind "HttpContext" class to the "ctx" object.
* - And bind "Logger" class to the "ctx.logger" object.
* - We bind "HttpContext" class to the "context" object.
* - And bind "Logger" class to the "context.logger" object.
*/
export default class ContainerBindingsMiddleware {
public async handle(ctx: HttpContext, next: NextFn): Promise<void> {
ctx.containerResolver.bindValue(HttpContext, ctx)
ctx.containerResolver.bindValue(Logger, ctx.logger)
public async handle(context: HttpContext, next: NextFn): Promise<void> {
context.containerResolver.bindValue(HttpContext, context)
context.containerResolver.bindValue(Logger, context.logger)
return next()
}

View File

@ -0,0 +1,30 @@
import { BaseModel, column, manyToMany } from "@adonisjs/lucid/orm"
import type { ManyToMany } from "@adonisjs/lucid/types/relations"
/**
 * Lucid model for a Wikipedia page, stored in the `pages` table.
 *
 * Pages link to each other through the `internal_links` pivot table
 * (`from_page_id` -> `to_page_id`), exposed here as `internalLinks`.
 */
export default class Page extends BaseModel {
  /**
   * Table backing this model.
   *
   * Lucid reads the table name from the static `table` property; an instance
   * field named `tableName` (the migration convention) is ignored by models,
   * so the name would otherwise only be inferred from the class name.
   */
  public static override table = "pages"

  /** Primary key of the page. */
  @column({ columnName: "id", serializeAs: "id", isPrimary: true })
  declare id: number

  /** Title of the page. */
  @column({
    columnName: "title",
    serializeAs: "title",
  })
  declare title: string

  /**
   * Pages that this page links to: rows of `internal_links` whose
   * `from_page_id` is this page's `id`, resolved through `to_page_id`.
   */
  @manyToMany(
    () => {
      return Page
    },
    {
      pivotTable: "internal_links",
      localKey: "id",
      relatedKey: "id",
      pivotForeignKey: "from_page_id",
      pivotRelatedForeignKey: "to_page_id",
      serializeAs: "internalLinks",
    },
  )
  declare internalLinks: ManyToMany<typeof Page>
}

View File

@ -25,7 +25,7 @@ export default class User extends compose(BaseModel, AuthFinder) {
columnName: "full_name",
serializeAs: "fullName",
})
declare fullName: string | null
declare fullName: string
@column({
columnName: "email",
@ -49,7 +49,7 @@ export default class User extends compose(BaseModel, AuthFinder) {
autoCreate: true,
autoUpdate: true,
})
declare updatedAt: DateTime | null
declare updatedAt: DateTime
static accessTokens = DbAccessTokensProvider.forModel(User)
}

View File

@ -2,10 +2,10 @@ import env from "#start/env.js"
import { defineConfig } from "@adonisjs/lucid"
const databaseConfig = defineConfig({
connection: "mysql",
connection: "postgres",
connections: {
mysql: {
client: "mysql2",
postgres: {
client: "pg",
connection: {
host: env.get("DATABASE_HOST"),
port: env.get("DATABASE_PORT"),

View File

@ -6,12 +6,12 @@ export default class CreateUsersTable extends BaseSchema {
public override async up(): Promise<void> {
void this.schema.createTable(this.tableName, (table) => {
table.increments("id").notNullable()
table.string("full_name").nullable()
table.string("full_name").notNullable()
table.string("email", 254).notNullable().unique()
table.string("password").notNullable()
table.timestamp("created_at").notNullable()
table.timestamp("updated_at").nullable()
table.timestamp("updated_at").notNullable()
})
}

View File

@ -10,9 +10,9 @@ export default class CreateAccessTokensTable extends BaseSchema {
.integer("tokenable_id")
.notNullable()
.unsigned()
.references("id")
.inTable("users")
.references("users.id")
.onDelete("CASCADE")
.onUpdate("CASCADE")
table.string("type").notNullable()
table.string("name").nullable()

View File

@ -0,0 +1,16 @@
import { BaseSchema } from "@adonisjs/lucid/schema"
/**
 * Migration for the `pages` table: an auto-incrementing `id` and a unique,
 * non-null `title` of at most 255 characters.
 */
export default class CreatePagesTable extends BaseSchema {
  protected tableName = "pages"

  /** Creates the `pages` table. */
  public override async up(): Promise<void> {
    const pagesTableName = this.tableName
    void this.schema.createTable(pagesTableName, (pages) => {
      pages.increments("id").notNullable()
      pages.string("title", 255).notNullable().unique()
    })
  }

  /** Drops the `pages` table. */
  public override async down(): Promise<void> {
    const pagesTableName = this.tableName
    void this.schema.dropTable(pagesTableName)
  }
}

View File

@ -0,0 +1,29 @@
import { BaseSchema } from "@adonisjs/lucid/schema"
/**
 * Migration for the `internal_links` table, which stores one row per link
 * from a page (`from_page_id`) to another page (`to_page_id`).
 */
export default class CreateInternalLinksTable extends BaseSchema {
  protected tableName = "internal_links"

  /**
   * Creates the table with a composite primary key over both columns; each
   * column is a non-null integer referencing `pages.id`, cascading on
   * delete and on update.
   */
  public override async up(): Promise<void> {
    void this.schema.createTable(this.tableName, (links) => {
      const pageIdColumns = ["from_page_id", "to_page_id"]
      links.primary(pageIdColumns)

      // Both columns share the exact same definition, declared in key order.
      for (const pageIdColumn of pageIdColumns) {
        links
          .integer(pageIdColumn)
          .unsigned()
          .notNullable()
          .references("pages.id")
          .onDelete("CASCADE")
          .onUpdate("CASCADE")
      }
    })
  }

  /** Drops the `internal_links` table. */
  public override async down(): Promise<void> {
    void this.schema.dropTable(this.tableName)
  }
}

View File

@ -1,27 +1,20 @@
services:
wikipedia-solver-dev-database:
container_name: "wikipedia-solver-dev-database"
image: "mariadb:10.6.17"
image: "postgres:16.3"
restart: "unless-stopped"
env_file: ".env"
environment:
MARIADB_USER: ${DATABASE_USER}
MARIADB_PASSWORD: ${DATABASE_PASSWORD}
MARIADB_ROOT_PASSWORD: ${DATABASE_PASSWORD}
MARIADB_DATABASE: ${DATABASE_NAME}
POSTGRES_USER: ${DATABASE_USER}
POSTGRES_PASSWORD: ${DATABASE_PASSWORD}
POSTGRES_DB: ${DATABASE_NAME}
command: |
--innodb_buffer_pool_size=4G
--key-buffer-size=4G
--innodb_log_buffer_size=256M
--innodb_log_file_size=1G
--innodb_write_io_threads=16
--innodb_flush_log_at_trx_commit=0
--max_allowed_packet=1G
--max_wal_size=4GB
ports:
- "${DATABASE_PORT-3306}:${DATABASE_PORT-3306}"
- "${DATABASE_PORT-5432}:${DATABASE_PORT-5432}"
volumes:
- "wikipedia-solver-dev-mariadb-data:/var/lib/mysql"
# - "./sql:/docker-entrypoint-initdb.d/"
- "wikipedia-solver-dev-postgres-data:/var/lib/postgresql/data"
- "./data:/data/"
wikipedia-solver-dev-adminer:
container_name: "wikipedia-solver-dev-adminer"
@ -38,4 +31,4 @@ services:
- "./data/adminer/fonts/:/var/www/html/fonts"
volumes:
wikipedia-solver-dev-mariadb-data:
wikipedia-solver-dev-postgres-data:

View File

@ -27,27 +27,20 @@ services:
wikipedia-solver-database:
container_name: "wikipedia-solver-database"
image: "mariadb:10.6.17"
image: "postgres:16.3"
restart: "unless-stopped"
env_file: ".env"
environment:
MARIADB_USER: ${DATABASE_USER}
MARIADB_PASSWORD: ${DATABASE_PASSWORD}
MARIADB_ROOT_PASSWORD: ${DATABASE_PASSWORD}
MARIADB_DATABASE: ${DATABASE_NAME}
POSTGRES_USER: ${DATABASE_USER}
POSTGRES_PASSWORD: ${DATABASE_PASSWORD}
POSTGRES_DB: ${DATABASE_NAME}
command: |
--innodb_buffer_pool_size=4G
--key-buffer-size=4G
--innodb_log_buffer_size=256M
--innodb_log_file_size=1G
--innodb_write_io_threads=16
--innodb_flush_log_at_trx_commit=0
--max_allowed_packet=1G
--max_wal_size=4GB
ports:
- "${DATABASE_PORT-3306}:${DATABASE_PORT-3306}"
- "${DATABASE_PORT-5432}:${DATABASE_PORT-5432}"
volumes:
- "wikipedia-solver-mariadb-data:/var/lib/mysql"
# - "./sql:/docker-entrypoint-initdb.d/"
- "wikipedia-solver-postgres-data:/var/lib/postgresql/data"
- "./data:/data/"
volumes:
wikipedia-solver-mariadb-data:
wikipedia-solver-postgres-data:

View File

@ -1,3 +0,0 @@
DATABASE_USER=wikipedia_user
DATABASE_PASSWORD=password
DATABASE_NAME=wikipedia

View File

@ -2,7 +2,11 @@
```sh
./download-wikipedia-dump.sh
node --max-old-space-size=8096 database-wikipedia.js
node --max-old-space-size=8096 generate-sql-files.js
# Inside the Database container
docker exec -it wikipedia-solver-dev-database sh
/data/execute-sql.sh
```
## Utils
@ -11,13 +15,7 @@ Show the first 10 line of sql file: `head -n 10 ./dump/page.sql`
Show the first 10 characters of sql file: `head -c 10 ./dump/page.sql`
To inspect volume size used by database: `docker system df -v | grep 'wikipedia-solver-mariadb-data'`
To enter in the database container: `docker exec -it wikipedia-solver-database sh`
Then: `mariadb --password="${DATABASE_PASSWORD}" --user="${DATABASE_USER}"`
And `use wikipedia;`, for example: `SELECT * FROM pages LIMIT 10;` or to execute a SQL script: `source /docker-entrypoint-initdb.d/3-internal-links-inserts.sql;`.
To inspect volume size used by database: `docker system df -v`
## Remove a volume
@ -32,15 +30,22 @@ docker volume rm data_wikipedia-solver-mariadb-data
docker-compose down --volumes
```
## MySQL Related
## PostgreSQL Related
<https://stackoverflow.com/questions/43954631/issues-with-wikipedia-dump-table-pagelinks>
<https://stackoverflow.com/questions/12206600/how-to-speed-up-insertion-performance-in-postgresql>
MySQL any way to import a huge (32 GB) sql dump faster?: <https://stackoverflow.com/questions/40384864/importing-wikipedia-dump-to-mysql>
```sh
docker exec -it wikipedia-solver-dev-database sh
Import data.sql MySQL Docker Container: <https://stackoverflow.com/questions/43880026/import-data-sql-mysql-docker-container>
psql --username="${DATABASE_USER}" --dbname="${DATABASE_NAME}"
```
<https://dba.stackexchange.com/questions/83125/mysql-any-way-to-import-a-huge-32-gb-sql-dump-faster>
```sql
-- Execute script with inserts
\i /data/sql-pages-inserts/0001-pages-inserts.sql
\i /data/sql-internal-links-inserts/0001-internal-links-inserts.sql
```
## Dumps Links

View File

@ -1,39 +0,0 @@
services:
wikipedia-solver-database:
container_name: "wikipedia-solver-database"
image: "mariadb:10.6.17"
restart: "unless-stopped"
env_file: ".env"
environment:
MARIADB_USER: ${DATABASE_USER}
MARIADB_PASSWORD: ${DATABASE_PASSWORD}
MARIADB_ROOT_PASSWORD: ${DATABASE_PASSWORD}
MARIADB_DATABASE: ${DATABASE_NAME}
command: |
--innodb_buffer_pool_size=4G
--key-buffer-size=4G
--innodb_log_buffer_size=256M
--innodb_log_file_size=1G
--innodb_write_io_threads=16
--innodb_flush_log_at_trx_commit=0
--max_allowed_packet=1G
volumes:
- "wikipedia-solver-mariadb-data:/var/lib/mysql"
- "./sql:/docker-entrypoint-initdb.d/"
adminer:
container_name: "adminer"
image: "adminer:4.8.1"
restart: "unless-stopped"
ports:
- "8080:8080"
env_file: ".env"
environment:
ADMINER_DEFAULT_SERVER: "wikipedia-solver-database"
volumes:
- "./adminer/default-orange.css:/var/www/html/adminer.css"
- "./adminer/logo.png:/var/www/html/logo.png"
- "./adminer/fonts/:/var/www/html/fonts"
volumes:
wikipedia-solver-mariadb-data:

8
data/execute-sql.sh Executable file
View File

@ -0,0 +1,8 @@
#!/usr/bin/env bash

# Runs the import scripts one after the other, in this order:
# init, pages inserts, internal links inserts, end.
import_steps=(
  "/data/sql/0000-sql-init.sh"
  "/data/sql-pages-inserts/0000-pages.sh"
  "/data/sql-internal-links-inserts/0000-internal-links.sh"
  "/data/sql/0999-sql-end.sh"
)

for import_step in "${import_steps[@]}"; do
  "${import_step}"
done

View File

@ -7,7 +7,6 @@ import {
} from "./utils.js"
const SQL_DUMP_PATH = path.join(process.cwd(), "dump")
const SQL_OUTPUT_PATH = path.join(process.cwd(), "sql")
const SQL_FILENAME_NUMBER_PAD = 4
/**
@ -52,9 +51,42 @@ const cleanPagesSQL = async () => {
let current = ""
let lastPercent = 0
let pagesFileCount = 1
const INSERT_INTO_START_OUTPUT = "INSERT INTO pages (id, title) VALUES "
const BATCH_SIZE = 1_000_000
/**
* @type {string[]}
*/
let batch = []
/**
 * Writes the pending `batch` of page tuples as a single `INSERT` statement
 * into the next numbered file of the `sql-pages-inserts` directory, then
 * bumps the file counter and empties the batch.
 *
 * Does nothing when the batch is empty.
 *
 * @returns {Promise<void>}
 */
const flushBatch = async () => {
  if (batch.length === 0) {
    return
  }

  const sqlValues = batch.join(",")
  const fileName = `${zeroPad(pagesFileCount, SQL_FILENAME_NUMBER_PAD)}-pages-inserts.sql`
  const sqlOutputPath = path.join(process.cwd(), "sql-pages-inserts", fileName)
  const sqlStatement = `${INSERT_INTO_START_OUTPUT}${sqlValues};`

  await fs.promises.writeFile(sqlOutputPath, sqlStatement, {
    encoding: "utf-8",
  })
  console.log(`flushBatch - ${fileName}, batch.length: ${batch.length}`)

  pagesFileCount += 1
  batch = []
}
return await new Promise((resolve, reject) => {
sqlInputFileStream
.on("data", (dataInput) => {
.on("data", async (dataInput) => {
const bytesReadRatio = sqlInputFileStream.bytesRead / sqlInputStat.size
const bytesReadPercent = bytesReadRatio * 100
@ -98,13 +130,21 @@ const cleanPagesSQL = async () => {
if (namespace === "0" && !isRedirect) {
wikipediaPagesKeyId[id] = title
batch.push(`(${id},E${title})`)
}
}
if (batch.length >= BATCH_SIZE) {
sqlInputFileStream.pause()
await flushBatch()
sqlInputFileStream.resume()
}
})
.on("error", (error) => {
return reject(error)
})
.on("close", () => {
.on("close", async () => {
await flushBatch()
console.log("cleanPagesSQL - Bytes read (100%).")
return resolve(wikipediaPagesKeyId)
})
@ -113,30 +153,6 @@ const cleanPagesSQL = async () => {
const wikipediaPagesKeyId = await cleanPagesSQL()
const cleanPagesSQLWriteToFile = async () => {
console.log("cleanPagesSQLWriteToFile - Writing to file...")
const sqlOutputPath = path.join(
SQL_OUTPUT_PATH,
`${zeroPad(1, SQL_FILENAME_NUMBER_PAD)}-pages-inserts.sql`,
)
const INSERT_INTO_START_OUTPUT = "INSERT INTO pages (id, title) VALUES "
const wikipediaPagesString = Object.entries(wikipediaPagesKeyId)
.map(([id, title]) => {
return `(${id},${title})`
})
.join(",")
await fs.promises.writeFile(
sqlOutputPath,
`${INSERT_INTO_START_OUTPUT}${wikipediaPagesString};`,
{ encoding: "utf-8" },
)
console.log("cleanPagesSQLWriteToFile - Done.")
}
await cleanPagesSQLWriteToFile()
/**
* Function to clean the `pagelinks.sql` file by:
* - Removing all lines that don't start with `INSERT INTO...`.
@ -145,7 +161,7 @@ await cleanPagesSQLWriteToFile()
* @returns {Promise<void>}
*/
const cleanInternalLinksSQL = async () => {
let internalLinksFileCount = 2
let internalLinksFileCount = 1
const INSERT_INTO_START_OUTPUT =
"INSERT INTO internal_links (from_page_id, to_page_id) VALUES "
@ -174,7 +190,11 @@ const cleanInternalLinksSQL = async () => {
if (batch.length > 0) {
const batchString = batch.join(",")
const fileName = `${zeroPad(internalLinksFileCount, SQL_FILENAME_NUMBER_PAD)}-internal-links-inserts.sql`
const sqlOutputPath = path.join(SQL_OUTPUT_PATH, fileName)
const sqlOutputPath = path.join(
process.cwd(),
"sql-internal-links-inserts",
fileName,
)
await fs.promises.writeFile(
sqlOutputPath,
`${INSERT_INTO_START_OUTPUT}${batchString};`,

View File

@ -0,0 +1,6 @@
#!/usr/bin/env bash

# Executes every generated internal links insert file with psql, in glob
# (lexicographic) order, printing each file name and the time it took.
#
# Reads DATABASE_USER and DATABASE_NAME from the environment.

# Without nullglob, an empty directory leaves the pattern unexpanded and psql
# is invoked on the literal "*.sql" path; with it, the loop is simply skipped.
shopt -s nullglob

for sqlInsert in /data/sql-internal-links-inserts/*.sql; do
  echo "${sqlInsert}"
  time psql --username="${DATABASE_USER}" --dbname="${DATABASE_NAME}" --file="${sqlInsert}"
done

View File

@ -0,0 +1,6 @@
#!/usr/bin/env bash

# Executes every generated pages insert file with psql, in glob
# (lexicographic) order, printing each file name and the time it took.
#
# Reads DATABASE_USER and DATABASE_NAME from the environment.

# Without nullglob, an empty directory leaves the pattern unexpanded and psql
# is invoked on the literal "*.sql" path; with it, the loop is simply skipped.
shopt -s nullglob

for sqlInsert in /data/sql-pages-inserts/*.sql; do
  echo "${sqlInsert}"
  time psql --username="${DATABASE_USER}" --dbname="${DATABASE_NAME}" --file="${sqlInsert}"
done

3
data/sql/0000-sql-init.sh Executable file
View File

@ -0,0 +1,3 @@
#!/usr/bin/env bash

# Runs the SQL init file through psql and reports how long it took.
# Reads DATABASE_USER and DATABASE_NAME from the environment.
sql_file="/data/sql/0000-sql-init.sql"

time psql --username="${DATABASE_USER}" --dbname="${DATABASE_NAME}" --file="${sql_file}"

View File

@ -0,0 +1,2 @@
-- Disable every trigger on both import tables.
-- NOTE(review): per the PostgreSQL documentation, ALL also covers the internal
-- triggers that enforce foreign keys and requires superuser privileges --
-- confirm the import role has them.
ALTER TABLE pages DISABLE TRIGGER ALL;
ALTER TABLE internal_links DISABLE TRIGGER ALL;

View File

@ -1,28 +0,0 @@
CREATE TABLE `pages` (
`id` INT(8) UNSIGNED NOT NULL AUTO_INCREMENT,
`title` VARBINARY(255) NOT NULL DEFAULT '',
-- `is_redirect` tinyint(1) unsigned NOT NULL DEFAULT 0,
PRIMARY KEY (`id`),
UNIQUE KEY (`title`)
) ENGINE=MyISAM AUTO_INCREMENT=76684425 DEFAULT CHARSET=binary ROW_FORMAT=COMPRESSED;
-- VARBINARY usage instead of VARCHAR explanation: <https://stackoverflow.com/a/13397437>
-- > War on varchar. Changed all occurrences of varchar(N) and varchar(N) binary to varbinary(N). varchars cause problems ("Invalid mix of collations" errors) on MySQL databases with certain configs, most notably the default MySQL config.
CREATE TABLE `internal_links` (
-- `id` INT(8) UNSIGNED NOT NULL AUTO_INCREMENT,
`from_page_id` INT(8) UNSIGNED NOT NULL,
`to_page_id` INT(8) UNSIGNED NOT NULL,
-- PRIMARY KEY (`id`)
PRIMARY KEY (`from_page_id`, `to_page_id`),
FOREIGN KEY (`from_page_id`) REFERENCES `pages` (`id`) ON DELETE CASCADE,
FOREIGN KEY (`to_page_id`) REFERENCES `pages` (`id`) ON DELETE CASCADE
) ENGINE=MyISAM DEFAULT CHARSET=binary ROW_FORMAT=COMPRESSED;
SET @@session.unique_checks = 0;
SET @@session.foreign_key_checks = 0;
SET FOREIGN_KEY_CHECKS = 0;
SET UNIQUE_CHECKS = 0;

View File

@ -1,11 +0,0 @@
-- SET @@session.foreign_key_checks = 0;
-- SET FOREIGN_KEY_CHECKS = 0;
-- ALTER TABLE `internal_links` ADD CONSTRAINT fk_from_page_id FOREIGN KEY (`from_page_id`) REFERENCES `pages` (`id`);
-- ALTER TABLE `internal_links` ADD CONSTRAINT fk_to_page_id FOREIGN KEY (`to_page_id`) REFERENCES `pages` (`id`);
SET @@session.unique_checks = 1;
SET @@session.foreign_key_checks = 1;
SET FOREIGN_KEY_CHECKS = 1;
SET UNIQUE_CHECKS = 1;

3
data/sql/0999-sql-end.sh Executable file
View File

@ -0,0 +1,3 @@
#!/usr/bin/env bash

# Runs the SQL end file through psql and reports how long it took.
# Reads DATABASE_USER and DATABASE_NAME from the environment.
sql_file="/data/sql/0999-sql-end.sql"

time psql --username="${DATABASE_USER}" --dbname="${DATABASE_NAME}" --file="${sql_file}"

View File

@ -0,0 +1,2 @@
-- Re-enable every trigger on both import tables.
-- NOTE(review): re-enabling triggers presumably does not re-validate rows
-- inserted while they were disabled -- confirm against the PostgreSQL docs
-- if referential integrity of the imported data matters.
ALTER TABLE pages ENABLE TRIGGER ALL;
ALTER TABLE internal_links ENABLE TRIGGER ALL;

View File

@ -26,7 +26,7 @@
"editorconfig-checker": "5.1.8",
"playwright": "catalog:",
"prettier": "3.3.3",
"prettier-plugin-tailwindcss": "0.6.5",
"prettier-plugin-tailwindcss": "0.6.6",
"replace-in-files-cli": "3.0.0",
"semantic-release": "23.1.1",
"turbo": "2.0.12",

File diff suppressed because it is too large Load Diff

View File

@ -21,7 +21,7 @@ catalog:
# TypeScript
"typescript": "5.5.4"
"@total-typescript/ts-reset": "0.5.1"
"@types/node": "22.1.0"
"@types/node": "22.2.0"
"tsx": "4.17.0"
# AdonisJS
@ -29,7 +29,7 @@ catalog:
"@adonisjs/core": "6.12.1"
"@adonisjs/cors": "2.2.1"
"@adonisjs/lucid": "21.2.0"
"mysql2": "3.11.0"
"pg": "8.12.0"
"@adonisjs/assembler": "7.7.0"
"@vinejs/vine": "2.1.0"
"luxon": "3.5.0"