# Wikipedia data

```sh
./download-wikipedia-dump.sh
node --max-old-space-size=10096 database-wikipedia.js
```

## Utils

Show the first 10 lines of a SQL file: `head -n 10 ./dump/page.sql`

Show the first 10 characters (bytes) of a SQL file: `head -c 10 ./dump/page.sql`

To inspect the volume size used by the database: `docker system df -v | grep 'wikipedia-solver-mariadb-data'`

## Remove a volume

```sh
# List all volumes
docker volume ls

# Remove a volume
docker volume rm data_wikipedia-solver-mariadb-data

# Or by using docker compose down
docker-compose down --volumes
```

## MySQL Related

- MySQL any way to import a huge (32 GB) sql dump faster?:
- Import data.sql MySQL Docker Container:

## Dumps Links

- Database layout:
-
-
- Run SQL queries against Wikipedia:

```sql
-- Get the sanitized title of a page linked in the page with title 'Node.js'
SELECT lt.lt_title
FROM linktarget lt
WHERE lt.lt_id = (
  SELECT pl.pl_target_id
  FROM pagelinks pl
  WHERE pl.pl_from = (
    SELECT p.page_id
    FROM page p
    WHERE p.page_title = 'Node.js' AND p.page_namespace = 0
  )
  LIMIT 1
);
```

## `page.sql.gz`

- MySQL full version up until inserts

```sql
-- MySQL dump 10.19 Distrib 10.3.38-MariaDB, for debian-linux-gnu (x86_64)
--
-- Host: db1206 Database: enwiki
-- ------------------------------------------------------
-- Server version 10.4.26-MariaDB-log

/*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
/*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
/*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
/*!40101 SET NAMES utf8mb4 */;
/*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */;
/*!40103 SET TIME_ZONE='+00:00' */;
/*!40014 SET @OLD_UNIQUE_CHECKS=@@UNIQUE_CHECKS, UNIQUE_CHECKS=0 */;
/*!40014 SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0 */;
/*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='NO_AUTO_VALUE_ON_ZERO' */;
/*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */;

--
-- Table structure for table `page`
--

DROP TABLE IF EXISTS `page`;
/*!40101 SET @saved_cs_client = @@character_set_client */;
/*!40101 SET character_set_client = utf8 */;
CREATE TABLE `page` (
  `page_id` int(8) unsigned NOT NULL AUTO_INCREMENT,
  `page_namespace` int(11) NOT NULL DEFAULT 0,
  `page_title` varbinary(255) NOT NULL DEFAULT '',
  `page_is_redirect` tinyint(1) unsigned NOT NULL DEFAULT 0,
  `page_is_new` tinyint(1) unsigned NOT NULL DEFAULT 0,
  `page_random` double unsigned NOT NULL DEFAULT 0,
  `page_touched` binary(14) NOT NULL,
  `page_links_updated` varbinary(14) DEFAULT NULL,
  `page_latest` int(8) unsigned NOT NULL DEFAULT 0,
  `page_len` int(8) unsigned NOT NULL DEFAULT 0,
  `page_content_model` varbinary(32) DEFAULT NULL,
  `page_lang` varbinary(35) DEFAULT NULL,
  PRIMARY KEY (`page_id`),
  UNIQUE KEY `page_name_title` (`page_namespace`,`page_title`),
  KEY `page_random` (`page_random`),
  KEY `page_len` (`page_len`),
  KEY `page_redirect_namespace_len` (`page_is_redirect`,`page_namespace`,`page_len`)
) ENGINE=InnoDB AUTO_INCREMENT=76684425 DEFAULT CHARSET=binary ROW_FORMAT=COMPRESSED;
/*!40101 SET character_set_client = @saved_cs_client */;

--
-- Dumping data for table `page`
--
```

## `pagelinks.sql.gz`

- MySQL full version up until inserts

```sql
-- MySQL dump 10.19 Distrib 10.3.38-MariaDB, for debian-linux-gnu (x86_64)
--
-- Host: db1206 Database: enwiki
-- ------------------------------------------------------
-- Server version 10.4.26-MariaDB-log

/*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
/*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
/*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
/*!40101 SET NAMES utf8mb4 */;
/*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */;
/*!40103 SET TIME_ZONE='+00:00' */;
/*!40014 SET @OLD_UNIQUE_CHECKS=@@UNIQUE_CHECKS, UNIQUE_CHECKS=0 */;
/*!40014 SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0 */;
/*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='NO_AUTO_VALUE_ON_ZERO' */;
/*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */;

--
-- Table structure for table `pagelinks`
--

DROP TABLE IF EXISTS `pagelinks`;
/*!40101 SET @saved_cs_client = @@character_set_client */;
/*!40101 SET character_set_client = utf8 */;
CREATE TABLE `pagelinks` (
  `pl_from` int(8) unsigned NOT NULL DEFAULT 0,
  `pl_namespace` int(11) NOT NULL DEFAULT 0,
  `pl_title` varbinary(255) NOT NULL DEFAULT '',
  `pl_from_namespace` int(11) NOT NULL DEFAULT 0,
  `pl_target_id` bigint(20) unsigned DEFAULT NULL,
  PRIMARY KEY (`pl_from`,`pl_namespace`,`pl_title`),
  KEY `pl_namespace` (`pl_namespace`,`pl_title`,`pl_from`),
  KEY `pl_backlinks_namespace` (`pl_from_namespace`,`pl_namespace`,`pl_title`,`pl_from`),
  KEY `pl_target_id` (`pl_target_id`,`pl_from`),
  KEY `pl_backlinks_namespace_target_id` (`pl_from_namespace`,`pl_target_id`,`pl_from`)
) ENGINE=InnoDB DEFAULT CHARSET=binary ROW_FORMAT=COMPRESSED;
/*!40101 SET character_set_client = @saved_cs_client */;

--
-- Dumping data for table `pagelinks`
--
```