feat: cache.json for wikipedia links

Théo LUDWIG 2024-07-28 17:57:41 +02:00
parent 867fc131b1
commit ccd44c10fa
Signed by: theoludwig
GPG Key ID: ADFE5A563D718F3B
9 changed files with 137 additions and 54 deletions

.gitignore vendored
View File

@@ -21,6 +21,7 @@ build/
*.pem
.turbo
bin/
cache.json
# debug
npm-debug.log*

View File

@@ -11,6 +11,8 @@
## Links
- <https://github.com/shyamupa/wikidump_preprocessing>
- <https://www.mediawiki.org/wiki/API:Allpages>
- <https://www.thewikigame.com/>
- How to get all URLs in a Wikipedia page: <https://stackoverflow.com/questions/14882571/how-to-get-all-urls-in-a-wikipedia-page>
- <https://en.wikipedia.org/w/api.php?action=query&titles=Title&prop=links&pllimit=max&format=json>
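Editor's note: the `prop=links` endpoint referenced above pages its results through `plcontinue`. The sketch below only illustrates that API; it is not the repository's `getWikipediaPageInternalLinks` implementation, and the helper name and response type are invented for the example.

```ts
// Illustrative only: fetch every internal link of one page with
// action=query&prop=links, following `plcontinue` until exhausted.
interface QueryLinksResponse {
  continue?: { plcontinue: string }
  query: {
    pages: Record<string, { title: string; links?: Array<{ title: string }> }>
  }
}

const fetchAllLinks = async (title: string, locale = "en"): Promise<string[]> => {
  const links: string[] = []
  let plcontinue: string | undefined
  do {
    const url = new URL(`https://${locale}.wikipedia.org/w/api.php`)
    url.searchParams.set("action", "query")
    url.searchParams.set("titles", title)
    url.searchParams.set("prop", "links")
    url.searchParams.set("pllimit", "max")
    url.searchParams.set("format", "json")
    if (plcontinue != null) {
      url.searchParams.set("plcontinue", plcontinue)
    }
    const response = await fetch(url)
    if (!response.ok) {
      throw new Error(response.statusText)
    }
    const json = (await response.json()) as QueryLinksResponse
    for (const page of Object.values(json.query.pages)) {
      links.push(...(page.links ?? []).map((link) => link.title))
    }
    plcontinue = json.continue?.plcontinue
  } while (plcontinue != null)
  return links
}

console.log(await fetchAllLinks("Node.js"))
```

Each response returns at most `pllimit=max` (500 for normal clients) links, so a heavily linked page takes several round trips.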

View File

@@ -1,26 +1,62 @@
 #!/usr/bin/env -S node --import=tsx
+import type { WikipediaPagesInternalLinks } from "@repo/wikipedia-game-solver/wikipedia-api"
 import { getWikipediaPageInternalLinks } from "@repo/wikipedia-game-solver/wikipedia-api"
+import fs from "node:fs"
+import path from "node:path"
 const localeWikipedia = "en"
+const cachePath = path.join(process.cwd(), "cache.json")
-const fromPageInput = "Linux"
-const toPageInput = "Node.js"
-console.log({
-  fromPageInput,
-  toPageInput,
-})
-const [fromPageWikipediaLinks, toPageWikipediaLinks] = await Promise.all([
-  getWikipediaPageInternalLinks({
-    title: fromPageInput,
-    locale: localeWikipedia,
-  }),
-  getWikipediaPageInternalLinks({
-    title: toPageInput,
-    locale: localeWikipedia,
-  }),
-])
-console.log({
-  fromPageWikipediaLinks,
-  toPageWikipediaLinks,
-})
+const fromPageInput = "New York City"
+// const fromPageInput = "Linux"
+// const toPageInput = "Node.js"
+// console.log({
+//   fromPageInput,
+//   toPageInput,
+// })
+// const [fromPageWikipediaLinks, toPageWikipediaLinks] = await Promise.all([
+//   getWikipediaPageInternalLinks({
+//     title: fromPageInput,
+//     locale: localeWikipedia,
+//   }),
+//   getWikipediaPageInternalLinks({
+//     title: toPageInput,
+//     locale: localeWikipedia,
+//   }),
+// ])
+// console.log({
+//   fromPageWikipediaLinks,
+//   toPageWikipediaLinks,
+// })
+// const data = {
+//   [fromPageWikipediaLinks.title]: fromPageWikipediaLinks,
+//   [toPageWikipediaLinks.title]: toPageWikipediaLinks,
+// }
+const data = JSON.parse(
+  await fs.promises.readFile(cachePath, { encoding: "utf-8" }),
+) as WikipediaPagesInternalLinks
+// let maxLinks = { max: 0, title: "" }
+// for (const [title, page] of Object.entries(data)) {
+//   if (page.links.length > maxLinks.max) {
+//     maxLinks = { max: page.links.length, title }
+//   }
+// }
+// console.log(maxLinks)
+const pageLinks = (data[fromPageInput]?.links ?? []).slice(0, 1100)
+for (const pageLink of pageLinks) {
+  if (pageLink in data) {
+    continue
+  }
+  console.log("Fetching", pageLink)
+  data[pageLink] = await getWikipediaPageInternalLinks({
+    title: pageLink,
+    locale: localeWikipedia,
+  })
+}
+await fs.promises.writeFile(cachePath, JSON.stringify(data, null, 2), {
+  encoding: "utf-8",
+})
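Editor's note on the cache layout: the script reads and writes `cache.json` as a map keyed by page title. The real `WikipediaPagesInternalLinks` type lives in `@repo/wikipedia-game-solver/wikipedia-api` and is not part of this diff; the following only infers the shape from how `data` is used above.

```ts
// Shape inferred from this script's usage (data[title].links, page.links.length);
// the actual definitions in @repo/wikipedia-game-solver/wikipedia-api may differ.
interface WikipediaPageInternalLinks {
  title: string
  links: string[]
}
type WikipediaPagesInternalLinks = Record<string, WikipediaPageInternalLinks>

// Illustrative cache.json content:
const exampleCache: WikipediaPagesInternalLinks = {
  Linux: { title: "Linux", links: ["Node.js", "Unix", "Linus Torvalds"] },
  "Node.js": { title: "Node.js", links: ["JavaScript", "Linux"] },
}
console.log(JSON.stringify(exampleCache, null, 2))
```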

data/README.md Normal file
View File

@@ -0,0 +1,17 @@
# Wikipedia data
Database layout: <https://www.mediawiki.org/wiki/Manual:Database_layout>
<https://stackoverflow.com/questions/43954631/issues-with-wikipedia-dump-table-pagelinks>
<https://stackoverflow.com/questions/40384864/importing-wikipedia-dump-to-mysql>
## Dumps Links
- <https://dumps.wikimedia.org/enwiki/>
- <https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pagelinks.sql.gz>
- <https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-page.sql.gz>
- <https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-all-titles-in-ns0.gz>
- <https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-iwlinks.sql.gz>
- <https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-all-titles.gz>
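Editor's note: these dumps are multi-gigabyte gzip/SQL archives. As an illustration only (this commit ships no dump-processing code), one of the smaller archives listed above could be streamed to disk and decompressed like this:

```ts
// Sketch: stream one dump archive to disk and gunzip it on the fly,
// so the archive never has to fit in memory.
import fs from "node:fs"
import { Readable } from "node:stream"
import { pipeline } from "node:stream/promises"
import type { ReadableStream } from "node:stream/web"
import zlib from "node:zlib"

// All page titles in namespace 0 (one of the smaller dumps above).
const dumpUrl =
  "https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-all-titles-in-ns0.gz"

const response = await fetch(dumpUrl)
if (!response.ok || response.body === null) {
  throw new Error(`Download failed: ${response.status} ${response.statusText}`)
}
await pipeline(
  Readable.fromWeb(response.body as ReadableStream<Uint8Array>),
  zlib.createGunzip(),
  fs.createWriteStream("enwiki-latest-all-titles-in-ns0"),
)
```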

View File

@@ -17,6 +17,7 @@
"@repo/config-tailwind": "workspace:*",
"@repo/i18n": "workspace:*",
"@repo/ui": "workspace:*",
"ky": "catalog:",
"next": "catalog:",
"next-intl": "catalog:",
"react": "catalog:",

View File

@@ -1,10 +1,8 @@
 "use client"
-import type { Locale } from "@repo/i18n/config"
 import { Button } from "@repo/ui/design/Button"
 import { Link } from "@repo/ui/design/Link"
 import { Typography } from "@repo/ui/design/Typography"
-import { useLocale } from "next-intl"
 import { useState } from "react"
 import {
   fromLocaleToWikipediaLocale,
@@ -15,8 +13,7 @@ import {
 export const WikipediaClient: React.FC = () => {
   const [isLoading, setIsLoading] = useState(false)
-  const localeCurrent = useLocale() as Locale
-  const localeWikipedia = fromLocaleToWikipediaLocale(localeCurrent)
+  const localeWikipedia = fromLocaleToWikipediaLocale("en-US")
   const handleClick: React.MouseEventHandler<HTMLButtonElement> = async () => {
     setIsLoading(true)

View File

@@ -1,4 +1,5 @@
 import type { Locale } from "@repo/i18n/config"
+import ky from "ky"
 export const sum = (a: number, b: number): number => {
   return a + b
@@ -99,14 +100,11 @@ export const getWikipediaPageInternalLinks = async (
     if (plcontinue != null) {
       url.searchParams.set("plcontinue", plcontinue)
     }
-    const response = await fetch(url, {
-      method: "GET",
-    })
-    if (!response.ok) {
-      throw new Error(response.statusText)
-    }
-    const json = (await response.json()) as WikipediaQueryLinksResponse
-    return json
+    return await ky
+      .get(url, {
+        method: "GET",
+      })
+      .json()
   }
   do {
@@ -129,8 +127,10 @@
           return link.title
         }),
       )
-    } catch {
-      break
+    } catch (error) {
+      console.error("Error", error)
+      console.error("title", title)
+      throw error
     }
   } while (plcontinue != null)
@@ -152,27 +152,43 @@ export interface GetDeepWikipediaPageInternalLinksInput {
 export const getDeepWikipediaPageInternalLinks = async (
   input: GetDeepWikipediaPageInternalLinksInput,
-): Promise<WikipediaPagesInternalLinks> => {
+): Promise<void> => {
   const pagesTitles = Object.keys(input.data)
-  await Promise.all(
-    pagesTitles.map(async (pageTitle) => {
-      const links = input.data[pageTitle]?.links ?? []
-      await Promise.all(
-        links.map(async (pageTitleLink) => {
-          if (pageTitleLink in input.data) {
-            return
-          }
-          input.data[pageTitleLink] = await getWikipediaPageInternalLinks({
-            locale: input.locale,
-            title: pageTitleLink,
-          })
-          await getDeepWikipediaPageInternalLinks({
-            locale: input.locale,
-            data: input.data,
-          })
-        }),
-      )
-    }),
-  )
-  return input.data
+  for (const pageTitle of pagesTitles) {
+    const links = input.data[pageTitle]?.links ?? []
+    for (const pageTitleLink of links) {
+      if (pageTitleLink in input.data) {
+        continue
+      }
+      input.data[pageTitleLink] = await getWikipediaPageInternalLinks({
+        locale: input.locale,
+        title: pageTitleLink,
+      })
+      // await getDeepWikipediaPageInternalLinks({
+      //   locale: input.locale,
+      //   data: input.data,
+      // })
+    }
+  }
+  // await Promise.all(
+  //   pagesTitles.map(async (pageTitle) => {
+  //     const links = input.data[pageTitle]?.links ?? []
+  //     await Promise.all(
+  //       links.map(async (pageTitleLink) => {
+  //         if (pageTitleLink in input.data) {
+  //           return
+  //         }
+  //         input.data[pageTitleLink] = await getWikipediaPageInternalLinks({
+  //           locale: input.locale,
+  //           title: pageTitleLink,
+  //         })
+  //         await getDeepWikipediaPageInternalLinks({
+  //           locale: input.locale,
+  //           data: input.data,
+  //         })
+  //       }),
+  //     )
+  //   }),
+  // )
 }
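Editor's note: after this change `getDeepWikipediaPageInternalLinks` returns `Promise<void>`, mutates `input.data` in place, and crawls pages one at a time; the previous nested `Promise.all` version stays commented out and the recursive self-call is disabled. A usage sketch, assuming the exports behave exactly as shown in this diff:

```ts
// Usage sketch (names and behaviour taken from the diff; the actual package
// may differ): seed `data` with one page, then let the crawl fill in the
// links of every page it discovers, mutating `data` in place.
import {
  getDeepWikipediaPageInternalLinks,
  getWikipediaPageInternalLinks,
} from "@repo/wikipedia-game-solver/wikipedia-api"
import type { WikipediaPagesInternalLinks } from "@repo/wikipedia-game-solver/wikipedia-api"

const locale = "en"
const startPage = await getWikipediaPageInternalLinks({
  title: "Linux",
  locale,
})
const data: WikipediaPagesInternalLinks = { [startPage.title]: startPage }

// Every title linked from an already-known page is fetched once and stored
// under its own key, so `data` can be dumped straight to cache.json.
await getDeepWikipediaPageInternalLinks({ locale, data })
console.log(Object.keys(data).length, "pages cached")
```

Left unbounded, this crawl would try to fetch essentially all of Wikipedia, which is presumably why the CLI script above caps each run at 1,100 links and persists everything it fetches to cache.json.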

View File

@@ -111,6 +111,9 @@ catalogs:
http-server:
specifier: 14.1.1
version: 14.1.1
ky:
specifier: 1.5.0
version: 1.5.0
next:
specifier: 14.2.5
version: 14.2.5
@@ -626,6 +629,9 @@ importers:
'@repo/ui':
specifier: workspace:*
version: link:../ui
ky:
specifier: 'catalog:'
version: 1.5.0
next:
specifier: 'catalog:'
version: 14.2.5(@babel/core@7.24.9)(@playwright/test@1.45.3)(react-dom@18.3.1(react@18.3.1))(react@18.3.1)
@@ -5448,6 +5454,10 @@ packages:
resolution: {integrity: sha512-dhG34DXATL5hSxJbIexCft8FChFXtmskoZYnoPWjXQuebWYCNkVeV3KkGegCK9CP1oswI/vQibS2GY7Em/sJJA==}
engines: {node: '>= 8'}
ky@1.5.0:
resolution: {integrity: sha512-bkQo+UqryW6Zmo/DsixYZE4Z9t2mzvNMhceyIhuMuInb3knm5Q+GNGMKveydJAj+Z6piN1SwI6eR/V0G+Z0BtA==}
engines: {node: '>=18'}
language-subtag-registry@0.3.23:
resolution: {integrity: sha512-0K65Lea881pHotoGEa5gDlMxt3pctLi2RplBb7Ezh4rRdLEOtgi7n4EwK9lamnUCkKBqaeKRVebTq6BAxSkpXQ==}
@@ -14005,6 +14015,8 @@ snapshots:
klona@2.0.6: {}
ky@1.5.0: {}
language-subtag-registry@0.3.23: {}
language-tags@1.0.9:

View File

@@ -5,6 +5,7 @@ packages:
catalog:
# Utils
"deepmerge": "4.3.1"
"ky": "1.5.0"
# React.js/Next.js
"next": "14.2.5"