feat: cache.json for wikipedia links

This commit is contained in:
Théo LUDWIG 2024-07-28 17:57:41 +02:00
parent 867fc131b1
commit ccd44c10fa
Signed by: theoludwig
GPG Key ID: ADFE5A563D718F3B
9 changed files with 137 additions and 54 deletions

1
.gitignore vendored
View File

@ -21,6 +21,7 @@ build/
*.pem *.pem
.turbo .turbo
bin/ bin/
cache.json
# debug # debug
npm-debug.log* npm-debug.log*

View File

@ -11,6 +11,8 @@
## Links ## Links
- <https://github.com/shyamupa/wikidump_preprocessing>
- <https://www.mediawiki.org/wiki/API:Allpages>
- <https://www.thewikigame.com/> - <https://www.thewikigame.com/>
- How to get all URLs in a Wikipedia page: <https://stackoverflow.com/questions/14882571/how-to-get-all-urls-in-a-wikipedia-page> - How to get all URLs in a Wikipedia page: <https://stackoverflow.com/questions/14882571/how-to-get-all-urls-in-a-wikipedia-page>
- <https://en.wikipedia.org/w/api.php?action=query&titles=Title&prop=links&pllimit=max&format=json> - <https://en.wikipedia.org/w/api.php?action=query&titles=Title&prop=links&pllimit=max&format=json>

View File

@ -1,26 +1,62 @@
#!/usr/bin/env -S node --import=tsx #!/usr/bin/env -S node --import=tsx
import type { WikipediaPagesInternalLinks } from "@repo/wikipedia-game-solver/wikipedia-api"
import { getWikipediaPageInternalLinks } from "@repo/wikipedia-game-solver/wikipedia-api" import { getWikipediaPageInternalLinks } from "@repo/wikipedia-game-solver/wikipedia-api"
import fs from "node:fs"
import path from "node:path"
const localeWikipedia = "en" const localeWikipedia = "en"
const cachePath = path.join(process.cwd(), "cache.json")
const fromPageInput = "Linux" const fromPageInput = "New York City"
const toPageInput = "Node.js" // const fromPageInput = "Linux"
console.log({ // const toPageInput = "Node.js"
fromPageInput, // console.log({
toPageInput, // fromPageInput,
}) // toPageInput,
const [fromPageWikipediaLinks, toPageWikipediaLinks] = await Promise.all([ // })
getWikipediaPageInternalLinks({ // const [fromPageWikipediaLinks, toPageWikipediaLinks] = await Promise.all([
title: fromPageInput, // getWikipediaPageInternalLinks({
// title: fromPageInput,
// locale: localeWikipedia,
// }),
// getWikipediaPageInternalLinks({
// title: toPageInput,
// locale: localeWikipedia,
// }),
// ])
// console.log({
// fromPageWikipediaLinks,
// toPageWikipediaLinks,
// })
// const data = {
// [fromPageWikipediaLinks.title]: fromPageWikipediaLinks,
// [toPageWikipediaLinks.title]: toPageWikipediaLinks,
// }
const data = JSON.parse(
await fs.promises.readFile(cachePath, { encoding: "utf-8" }),
) as WikipediaPagesInternalLinks
// let maxLinks = { max: 0, title: "" }
// for (const [title, page] of Object.entries(data)) {
// if (page.links.length > maxLinks.max) {
// maxLinks = { max: page.links.length, title }
// }
// }
// console.log(maxLinks)
const pageLinks = (data[fromPageInput]?.links ?? []).slice(0, 1100)
for (const pageLink of pageLinks) {
if (pageLink in data) {
continue
}
console.log("Fetching", pageLink)
data[pageLink] = await getWikipediaPageInternalLinks({
title: pageLink,
locale: localeWikipedia, locale: localeWikipedia,
}), })
getWikipediaPageInternalLinks({ }
title: toPageInput, await fs.promises.writeFile(cachePath, JSON.stringify(data, null, 2), {
locale: localeWikipedia, encoding: "utf-8",
}),
])
console.log({
fromPageWikipediaLinks,
toPageWikipediaLinks,
}) })

17
data/README.md Normal file
View File

@ -0,0 +1,17 @@
# Wikipedia data
Database layout: <https://www.mediawiki.org/wiki/Manual:Database_layout>
Issues with the Wikipedia dump table `pagelinks`: <https://stackoverflow.com/questions/43954631/issues-with-wikipedia-dump-table-pagelinks>
Importing a Wikipedia dump into MySQL: <https://stackoverflow.com/questions/40384864/importing-wikipedia-dump-to-mysql>
## Dump Links
- <https://dumps.wikimedia.org/enwiki/>
- <https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pagelinks.sql.gz>
- <https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-page.sql.gz>
- <https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-all-titles-in-ns0.gz>
- <https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-iwlinks.sql.gz>
- <https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-all-titles.gz>

View File

@ -17,6 +17,7 @@
"@repo/config-tailwind": "workspace:*", "@repo/config-tailwind": "workspace:*",
"@repo/i18n": "workspace:*", "@repo/i18n": "workspace:*",
"@repo/ui": "workspace:*", "@repo/ui": "workspace:*",
"ky": "catalog:",
"next": "catalog:", "next": "catalog:",
"next-intl": "catalog:", "next-intl": "catalog:",
"react": "catalog:", "react": "catalog:",

View File

@ -1,10 +1,8 @@
"use client" "use client"
import type { Locale } from "@repo/i18n/config"
import { Button } from "@repo/ui/design/Button" import { Button } from "@repo/ui/design/Button"
import { Link } from "@repo/ui/design/Link" import { Link } from "@repo/ui/design/Link"
import { Typography } from "@repo/ui/design/Typography" import { Typography } from "@repo/ui/design/Typography"
import { useLocale } from "next-intl"
import { useState } from "react" import { useState } from "react"
import { import {
fromLocaleToWikipediaLocale, fromLocaleToWikipediaLocale,
@ -15,8 +13,7 @@ import {
export const WikipediaClient: React.FC = () => { export const WikipediaClient: React.FC = () => {
const [isLoading, setIsLoading] = useState(false) const [isLoading, setIsLoading] = useState(false)
const localeCurrent = useLocale() as Locale const localeWikipedia = fromLocaleToWikipediaLocale("en-US")
const localeWikipedia = fromLocaleToWikipediaLocale(localeCurrent)
const handleClick: React.MouseEventHandler<HTMLButtonElement> = async () => { const handleClick: React.MouseEventHandler<HTMLButtonElement> = async () => {
setIsLoading(true) setIsLoading(true)

View File

@ -1,4 +1,5 @@
import type { Locale } from "@repo/i18n/config" import type { Locale } from "@repo/i18n/config"
import ky from "ky"
export const sum = (a: number, b: number): number => { export const sum = (a: number, b: number): number => {
return a + b return a + b
@ -99,14 +100,11 @@ export const getWikipediaPageInternalLinks = async (
if (plcontinue != null) { if (plcontinue != null) {
url.searchParams.set("plcontinue", plcontinue) url.searchParams.set("plcontinue", plcontinue)
} }
const response = await fetch(url, { return await ky
.get(url, {
method: "GET", method: "GET",
}) })
if (!response.ok) { .json()
throw new Error(response.statusText)
}
const json = (await response.json()) as WikipediaQueryLinksResponse
return json
} }
do { do {
@ -129,8 +127,10 @@ export const getWikipediaPageInternalLinks = async (
return link.title return link.title
}), }),
) )
} catch { } catch (error) {
break console.error("Error", error)
console.error("title", title)
throw error
} }
} while (plcontinue != null) } while (plcontinue != null)
@ -152,27 +152,43 @@ export interface GetDeepWikipediaPageInternalLinksInput {
export const getDeepWikipediaPageInternalLinks = async ( export const getDeepWikipediaPageInternalLinks = async (
input: GetDeepWikipediaPageInternalLinksInput, input: GetDeepWikipediaPageInternalLinksInput,
): Promise<WikipediaPagesInternalLinks> => { ): Promise<void> => {
const pagesTitles = Object.keys(input.data) const pagesTitles = Object.keys(input.data)
await Promise.all( for (const pageTitle of pagesTitles) {
pagesTitles.map(async (pageTitle) => {
const links = input.data[pageTitle]?.links ?? [] const links = input.data[pageTitle]?.links ?? []
await Promise.all( for (const pageTitleLink of links) {
links.map(async (pageTitleLink) => {
if (pageTitleLink in input.data) { if (pageTitleLink in input.data) {
return continue
} }
input.data[pageTitleLink] = await getWikipediaPageInternalLinks({ input.data[pageTitleLink] = await getWikipediaPageInternalLinks({
locale: input.locale, locale: input.locale,
title: pageTitleLink, title: pageTitleLink,
}) })
await getDeepWikipediaPageInternalLinks({ // await getDeepWikipediaPageInternalLinks({
locale: input.locale, // locale: input.locale,
data: input.data, // data: input.data,
}) // })
}), }
) }
}),
) // await Promise.all(
return input.data // pagesTitles.map(async (pageTitle) => {
// const links = input.data[pageTitle]?.links ?? []
// await Promise.all(
// links.map(async (pageTitleLink) => {
// if (pageTitleLink in input.data) {
// return
// }
// input.data[pageTitleLink] = await getWikipediaPageInternalLinks({
// locale: input.locale,
// title: pageTitleLink,
// })
// await getDeepWikipediaPageInternalLinks({
// locale: input.locale,
// data: input.data,
// })
// }),
// )
// }),
// )
} }

View File

@ -111,6 +111,9 @@ catalogs:
http-server: http-server:
specifier: 14.1.1 specifier: 14.1.1
version: 14.1.1 version: 14.1.1
ky:
specifier: 1.5.0
version: 1.5.0
next: next:
specifier: 14.2.5 specifier: 14.2.5
version: 14.2.5 version: 14.2.5
@ -626,6 +629,9 @@ importers:
'@repo/ui': '@repo/ui':
specifier: workspace:* specifier: workspace:*
version: link:../ui version: link:../ui
ky:
specifier: 'catalog:'
version: 1.5.0
next: next:
specifier: 'catalog:' specifier: 'catalog:'
version: 14.2.5(@babel/core@7.24.9)(@playwright/test@1.45.3)(react-dom@18.3.1(react@18.3.1))(react@18.3.1) version: 14.2.5(@babel/core@7.24.9)(@playwright/test@1.45.3)(react-dom@18.3.1(react@18.3.1))(react@18.3.1)
@ -5448,6 +5454,10 @@ packages:
resolution: {integrity: sha512-dhG34DXATL5hSxJbIexCft8FChFXtmskoZYnoPWjXQuebWYCNkVeV3KkGegCK9CP1oswI/vQibS2GY7Em/sJJA==} resolution: {integrity: sha512-dhG34DXATL5hSxJbIexCft8FChFXtmskoZYnoPWjXQuebWYCNkVeV3KkGegCK9CP1oswI/vQibS2GY7Em/sJJA==}
engines: {node: '>= 8'} engines: {node: '>= 8'}
ky@1.5.0:
resolution: {integrity: sha512-bkQo+UqryW6Zmo/DsixYZE4Z9t2mzvNMhceyIhuMuInb3knm5Q+GNGMKveydJAj+Z6piN1SwI6eR/V0G+Z0BtA==}
engines: {node: '>=18'}
language-subtag-registry@0.3.23: language-subtag-registry@0.3.23:
resolution: {integrity: sha512-0K65Lea881pHotoGEa5gDlMxt3pctLi2RplBb7Ezh4rRdLEOtgi7n4EwK9lamnUCkKBqaeKRVebTq6BAxSkpXQ==} resolution: {integrity: sha512-0K65Lea881pHotoGEa5gDlMxt3pctLi2RplBb7Ezh4rRdLEOtgi7n4EwK9lamnUCkKBqaeKRVebTq6BAxSkpXQ==}
@ -14005,6 +14015,8 @@ snapshots:
klona@2.0.6: {} klona@2.0.6: {}
ky@1.5.0: {}
language-subtag-registry@0.3.23: {} language-subtag-registry@0.3.23: {}
language-tags@1.0.9: language-tags@1.0.9:

View File

@ -5,6 +5,7 @@ packages:
catalog: catalog:
# Utils # Utils
"deepmerge": "4.3.1" "deepmerge": "4.3.1"
"ky": "1.5.0"
# React.js/Next.js # React.js/Next.js
"next": "14.2.5" "next": "14.2.5"