164 lines
5.4 KiB
Markdown
164 lines
5.4 KiB
Markdown
# Wikipedia data
|
|
|
|
```sh
|
|
./download-wikipedia-dump.sh
|
|
node --max-old-space-size=8096 generate-sql-files.js
|
|
|
|
# Inside the Database container
|
|
docker exec -it wikipedia-solver-dev-database sh
|
|
/data/execute-sql.sh
|
|
```
|
|
|
|
## Utils
|
|
|
|
Show the first 10 lines of the SQL file: `head -n 10 ./dump/page.sql`
|
|
|
|
Show the first 10 bytes of the SQL file: `head -c 10 ./dump/page.sql`
|
|
|
|
Inspect the volume size used by the database: `docker system df -v`
|
|
|
|
## Remove a volume
|
|
|
|
```sh
|
|
# List all volumes
|
|
docker volume ls
|
|
|
|
# Remove a volume
|
|
docker volume rm data_wikipedia-solver-mariadb-data
|
|
|
|
# Or by using docker compose down
|
|
docker-compose down --volumes
|
|
```
|
|
|
|
## PostgreSQL Related
|
|
|
|
<https://stackoverflow.com/questions/12206600/how-to-speed-up-insertion-performance-in-postgresql>
|
|
|
|
```sh
|
|
docker exec -it wikipedia-solver-dev-database sh
|
|
|
|
psql --username="${DATABASE_USER}" --dbname="${DATABASE_NAME}"
|
|
```
|
|
|
|
```sql
|
|
-- Execute script with inserts
|
|
\i /data/sql-pages-inserts/0001-pages-inserts.sql
|
|
|
|
/data/sql-internal-links-inserts/0001-internal-links.sh
|
|
```
|
|
|
|
## Dumps Links
|
|
|
|
- Database layout: <https://www.mediawiki.org/wiki/Manual:Database_layout>
|
|
- <https://en.wikipedia.org/wiki/Wikipedia:Database_download>
|
|
- <https://dumps.wikimedia.org/enwiki/>
|
|
- Run SQL queries against Wikipedia: <https://quarry.wmcloud.org/>
|
|
|
|
```sql
|
|
-- Get the sanitized title of a page linked in the page with title 'Node.js'
|
|
SELECT lt.lt_title FROM linktarget lt WHERE lt.lt_id = (
|
|
SELECT pl.pl_target_id FROM pagelinks pl WHERE pl.pl_from = (
|
|
SELECT p.page_id FROM page p WHERE p.page_title = 'Node.js' AND p.page_namespace = 0
|
|
) LIMIT 1
|
|
);
|
|
```
|
|
|
|
## `page.sql.gz` - MySQL full version up until inserts
|
|
|
|
```sql
|
|
-- MySQL dump 10.19 Distrib 10.3.38-MariaDB, for debian-linux-gnu (x86_64)
|
|
--
|
|
-- Host: db1206 Database: enwiki
|
|
-- ------------------------------------------------------
|
|
-- Server version 10.4.26-MariaDB-log
|
|
|
|
/*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
|
|
/*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
|
|
/*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
|
|
/*!40101 SET NAMES utf8mb4 */;
|
|
/*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */;
|
|
/*!40103 SET TIME_ZONE='+00:00' */;
|
|
/*!40014 SET @OLD_UNIQUE_CHECKS=@@UNIQUE_CHECKS, UNIQUE_CHECKS=0 */;
|
|
/*!40014 SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0 */;
|
|
/*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='NO_AUTO_VALUE_ON_ZERO' */;
|
|
/*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */;
|
|
|
|
--
|
|
-- Table structure for table `page`
|
|
--
|
|
|
|
DROP TABLE IF EXISTS `page`;
|
|
/*!40101 SET @saved_cs_client = @@character_set_client */;
|
|
/*!40101 SET character_set_client = utf8 */;
|
|
CREATE TABLE `page` (
|
|
`page_id` int(8) unsigned NOT NULL AUTO_INCREMENT,
|
|
`page_namespace` int(11) NOT NULL DEFAULT 0,
|
|
`page_title` varbinary(255) NOT NULL DEFAULT '',
|
|
`page_is_redirect` tinyint(1) unsigned NOT NULL DEFAULT 0,
|
|
`page_is_new` tinyint(1) unsigned NOT NULL DEFAULT 0,
|
|
`page_random` double unsigned NOT NULL DEFAULT 0,
|
|
`page_touched` binary(14) NOT NULL,
|
|
`page_links_updated` varbinary(14) DEFAULT NULL,
|
|
`page_latest` int(8) unsigned NOT NULL DEFAULT 0,
|
|
`page_len` int(8) unsigned NOT NULL DEFAULT 0,
|
|
`page_content_model` varbinary(32) DEFAULT NULL,
|
|
`page_lang` varbinary(35) DEFAULT NULL,
|
|
PRIMARY KEY (`page_id`),
|
|
UNIQUE KEY `page_name_title` (`page_namespace`,`page_title`),
|
|
KEY `page_random` (`page_random`),
|
|
KEY `page_len` (`page_len`),
|
|
KEY `page_redirect_namespace_len` (`page_is_redirect`,`page_namespace`,`page_len`)
|
|
) ENGINE=InnoDB AUTO_INCREMENT=76684425 DEFAULT CHARSET=binary ROW_FORMAT=COMPRESSED;
|
|
/*!40101 SET character_set_client = @saved_cs_client */;
|
|
|
|
--
|
|
-- Dumping data for table `page`
|
|
--
|
|
```
|
|
|
|
## `pagelinks.sql.gz` - MySQL full version up until inserts
|
|
|
|
```sql
|
|
-- MySQL dump 10.19 Distrib 10.3.38-MariaDB, for debian-linux-gnu (x86_64)
|
|
--
|
|
-- Host: db1206 Database: enwiki
|
|
-- ------------------------------------------------------
|
|
-- Server version 10.4.26-MariaDB-log
|
|
|
|
/*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
|
|
/*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
|
|
/*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
|
|
/*!40101 SET NAMES utf8mb4 */;
|
|
/*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */;
|
|
/*!40103 SET TIME_ZONE='+00:00' */;
|
|
/*!40014 SET @OLD_UNIQUE_CHECKS=@@UNIQUE_CHECKS, UNIQUE_CHECKS=0 */;
|
|
/*!40014 SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0 */;
|
|
/*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='NO_AUTO_VALUE_ON_ZERO' */;
|
|
/*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */;
|
|
|
|
--
|
|
-- Table structure for table `pagelinks`
|
|
--
|
|
|
|
DROP TABLE IF EXISTS `pagelinks`;
|
|
/*!40101 SET @saved_cs_client = @@character_set_client */;
|
|
/*!40101 SET character_set_client = utf8 */;
|
|
CREATE TABLE `pagelinks` (
|
|
`pl_from` int(8) unsigned NOT NULL DEFAULT 0,
|
|
`pl_namespace` int(11) NOT NULL DEFAULT 0,
|
|
`pl_title` varbinary(255) NOT NULL DEFAULT '',
|
|
`pl_from_namespace` int(11) NOT NULL DEFAULT 0,
|
|
`pl_target_id` bigint(20) unsigned DEFAULT NULL,
|
|
PRIMARY KEY (`pl_from`,`pl_namespace`,`pl_title`),
|
|
KEY `pl_namespace` (`pl_namespace`,`pl_title`,`pl_from`),
|
|
KEY `pl_backlinks_namespace` (`pl_from_namespace`,`pl_namespace`,`pl_title`,`pl_from`),
|
|
KEY `pl_target_id` (`pl_target_id`,`pl_from`),
|
|
KEY `pl_backlinks_namespace_target_id` (`pl_from_namespace`,`pl_target_id`,`pl_from`)
|
|
) ENGINE=InnoDB DEFAULT CHARSET=binary ROW_FORMAT=COMPRESSED;
|
|
/*!40101 SET character_set_client = @saved_cs_client */;
|
|
|
|
--
|
|
-- Dumping data for table `pagelinks`
|
|
--
|
|
```
|