2024-07-28 17:57:41 +02:00
# Wikipedia data
2024-08-05 14:04:28 +02:00
```sh
./download-wikipedia-dump.sh
2024-08-07 01:21:08 +02:00
node --max-old-space-size=8096 database-wikipedia.js
2024-08-05 14:04:28 +02:00
```
2024-08-05 00:37:06 +02:00
## Utils
Show the first 10 lines of the SQL file: `head -n 10 ./dump/page.sql`
2024-08-05 17:36:19 +02:00
Show the first 10 characters of the SQL file: `head -c 10 ./dump/page.sql`
2024-08-05 00:52:48 +02:00
To inspect the volume size used by the database: `docker system df -v | grep 'wikipedia-solver-mariadb-data'`
2024-08-05 00:37:06 +02:00
2024-08-07 01:21:08 +02:00
To enter the database container: `docker exec -it wikipedia-solver-database sh`
Then: `mariadb --password="${DATABASE_PASSWORD}" --user="${DATABASE_USER}"`
Then select the database with `use wikipedia;`. From there you can run a query, for example: `SELECT * FROM pages LIMIT 10;`, or execute a SQL script: `source /docker-entrypoint-initdb.d/3-internal-links-inserts.sql;`.
2024-08-05 00:52:48 +02:00
## Remove a volume
2024-08-05 00:37:06 +02:00
```sh
# List all volumes
docker volume ls
# Remove a volume
2024-08-05 00:52:48 +02:00
docker volume rm data_wikipedia-solver-mariadb-data
# Or by using docker compose down
docker-compose down --volumes
2024-08-05 00:37:06 +02:00
```
## MySQL Related
2024-07-28 17:57:41 +02:00
<https://stackoverflow.com/questions/43954631/issues-with-wikipedia-dump-table-pagelinks>
2024-08-05 00:37:06 +02:00
Importing Wikipedia dump to MySQL: <https://stackoverflow.com/questions/40384864/importing-wikipedia-dump-to-mysql>
Import data.sql MySQL Docker Container: <https://stackoverflow.com/questions/43880026/import-data-sql-mysql-docker-container>
MySQL any way to import a huge (32 GB) sql dump faster?: <https://dba.stackexchange.com/questions/83125/mysql-any-way-to-import-a-huge-32-gb-sql-dump-faster>
2024-07-28 17:57:41 +02:00
## Dumps Links
2024-08-05 00:37:06 +02:00
- Database layout: <https://www.mediawiki.org/wiki/Manual:Database_layout>
- <https://en.wikipedia.org/wiki/Wikipedia:Database_download>
2024-07-28 17:57:41 +02:00
- <https://dumps.wikimedia.org/enwiki/>
2024-08-05 14:04:28 +02:00
- Run SQL queries against Wikipedia: <https://quarry.wmcloud.org/>
2024-07-28 17:57:41 +02:00
2024-08-05 14:04:28 +02:00
```sql
-- Get the sanitized title of a page linked in the page with title 'Node.js'
SELECT lt.lt_title FROM linktarget lt WHERE lt.lt_id = (
SELECT pl.pl_target_id FROM pagelinks pl WHERE pl.pl_from = (
SELECT p.page_id FROM page p WHERE p.page_title = 'Node.js' AND p.page_namespace = 0
) LIMIT 1
);
```
2024-08-05 00:37:06 +02:00
2024-08-05 14:04:28 +02:00
## `page.sql.gz` - MySQL full version up until inserts
2024-08-05 00:37:06 +02:00
```sql
2024-08-05 14:04:28 +02:00
-- MySQL dump 10.19 Distrib 10.3.38-MariaDB, for debian-linux-gnu (x86_64)
2024-08-05 00:37:06 +02:00
--
-- Host: db1206 Database: enwiki
-- ------------------------------------------------------
2024-08-05 14:04:28 +02:00
-- Server version 10.4.26-MariaDB-log
2024-08-05 00:37:06 +02:00
/*!40101 SET @OLD_CHARACTER_SET_CLIENT =@@CHARACTER_SET_CLIENT */;
/*!40101 SET @OLD_CHARACTER_SET_RESULTS =@@CHARACTER_SET_RESULTS */;
/*!40101 SET @OLD_COLLATION_CONNECTION =@@COLLATION_CONNECTION */;
/*!40101 SET NAMES utf8mb4 */;
/*!40103 SET @OLD_TIME_ZONE =@@TIME_ZONE */;
/*!40103 SET TIME_ZONE='+00:00' */;
/*!40014 SET @OLD_UNIQUE_CHECKS =@@UNIQUE_CHECKS, UNIQUE_CHECKS=0 */;
/*!40014 SET @OLD_FOREIGN_KEY_CHECKS =@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0 */;
/*!40101 SET @OLD_SQL_MODE =@@SQL_MODE, SQL_MODE='NO_AUTO_VALUE_ON_ZERO' */;
/*!40111 SET @OLD_SQL_NOTES =@@SQL_NOTES, SQL_NOTES=0 */;
--
-- Table structure for table `page`
--
DROP TABLE IF EXISTS `page` ;
/*!40101 SET @saved_cs_client = @@character_set_client */;
/*!40101 SET character_set_client = utf8 */;
CREATE TABLE `page` (
`page_id` int(8) unsigned NOT NULL AUTO_INCREMENT,
`page_namespace` int(11) NOT NULL DEFAULT 0,
`page_title` varbinary(255) NOT NULL DEFAULT '',
`page_is_redirect` tinyint(1) unsigned NOT NULL DEFAULT 0,
`page_is_new` tinyint(1) unsigned NOT NULL DEFAULT 0,
`page_random` double unsigned NOT NULL DEFAULT 0,
`page_touched` binary(14) NOT NULL,
`page_links_updated` varbinary(14) DEFAULT NULL,
`page_latest` int(8) unsigned NOT NULL DEFAULT 0,
`page_len` int(8) unsigned NOT NULL DEFAULT 0,
`page_content_model` varbinary(32) DEFAULT NULL,
`page_lang` varbinary(35) DEFAULT NULL,
PRIMARY KEY (`page_id`),
UNIQUE KEY `page_name_title` (`page_namespace`,`page_title`),
KEY `page_random` (`page_random`),
KEY `page_len` (`page_len`),
KEY `page_redirect_namespace_len` (`page_is_redirect`,`page_namespace`,`page_len`)
2024-08-05 14:04:28 +02:00
) ENGINE=InnoDB AUTO_INCREMENT=76684425 DEFAULT CHARSET=binary ROW_FORMAT=COMPRESSED;
2024-08-05 00:37:06 +02:00
/*!40101 SET character_set_client = @saved_cs_client */;
--
-- Dumping data for table `page`
--
```
2024-08-05 14:04:28 +02:00
## `pagelinks.sql.gz` - MySQL full version up until inserts
```sql
-- MySQL dump 10.19 Distrib 10.3.38-MariaDB, for debian-linux-gnu (x86_64)
--
-- Host: db1206 Database: enwiki
-- ------------------------------------------------------
-- Server version 10.4.26-MariaDB-log
/*!40101 SET @OLD_CHARACTER_SET_CLIENT =@@CHARACTER_SET_CLIENT */;
/*!40101 SET @OLD_CHARACTER_SET_RESULTS =@@CHARACTER_SET_RESULTS */;
/*!40101 SET @OLD_COLLATION_CONNECTION =@@COLLATION_CONNECTION */;
/*!40101 SET NAMES utf8mb4 */;
/*!40103 SET @OLD_TIME_ZONE =@@TIME_ZONE */;
/*!40103 SET TIME_ZONE='+00:00' */;
/*!40014 SET @OLD_UNIQUE_CHECKS =@@UNIQUE_CHECKS, UNIQUE_CHECKS=0 */;
/*!40014 SET @OLD_FOREIGN_KEY_CHECKS =@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0 */;
/*!40101 SET @OLD_SQL_MODE =@@SQL_MODE, SQL_MODE='NO_AUTO_VALUE_ON_ZERO' */;
/*!40111 SET @OLD_SQL_NOTES =@@SQL_NOTES, SQL_NOTES=0 */;
--
-- Table structure for table `pagelinks`
--
DROP TABLE IF EXISTS `pagelinks` ;
/*!40101 SET @saved_cs_client = @@character_set_client */;
/*!40101 SET character_set_client = utf8 */;
CREATE TABLE `pagelinks` (
`pl_from` int(8) unsigned NOT NULL DEFAULT 0,
`pl_namespace` int(11) NOT NULL DEFAULT 0,
`pl_title` varbinary(255) NOT NULL DEFAULT '',
`pl_from_namespace` int(11) NOT NULL DEFAULT 0,
`pl_target_id` bigint(20) unsigned DEFAULT NULL,
PRIMARY KEY (`pl_from`,`pl_namespace`,`pl_title`),
KEY `pl_namespace` (`pl_namespace`,`pl_title`,`pl_from`),
KEY `pl_backlinks_namespace` (`pl_from_namespace`,`pl_namespace`,`pl_title`,`pl_from`),
KEY `pl_target_id` (`pl_target_id`,`pl_from`),
KEY `pl_backlinks_namespace_target_id` (`pl_from_namespace`,`pl_target_id`,`pl_from`)
) ENGINE=InnoDB DEFAULT CHARSET=binary ROW_FORMAT=COMPRESSED;
/*!40101 SET character_set_client = @saved_cs_client */;
--
-- Dumping data for table `pagelinks`
--
```