feat: cache.json for wikipedia links
parent 867fc131b1
commit ccd44c10fa
.gitignore (vendored) · 1 change
@@ -21,6 +21,7 @@ build/
 *.pem
 .turbo
 bin/
+cache.json
 
 # debug
 npm-debug.log*
TODO.md · 2 changes
@@ -11,6 +11,8 @@
 
 ## Links
 
 - <https://github.com/shyamupa/wikidump_preprocessing>
 - <https://www.mediawiki.org/wiki/API:Allpages>
 - <https://www.thewikigame.com/>
+- How to get all URLs in a Wikipedia page: <https://stackoverflow.com/questions/14882571/how-to-get-all-urls-in-a-wikipedia-page>
+- <https://en.wikipedia.org/w/api.php?action=query&titles=Title&prop=links&pllimit=max&format=json>
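The last two links describe the MediaWiki endpoint this commit builds on: `action=query&prop=links&pllimit=max` returns up to about 500 internal links per request, plus a `continue.plcontinue` token when a page has more. A minimal, standalone sketch of that pagination loop using plain `fetch`; the response typing below is an assumption based on the API docs, not the repo's own types:

```ts
// Sketch of the MediaWiki "prop=links" pagination loop (assumed response shape).
interface LinksQueryResponse {
  continue?: { plcontinue: string }
  query: { pages: Record<string, { title: string; links?: { title: string }[] }> }
}

const fetchInternalLinks = async (title: string, locale = "en"): Promise<string[]> => {
  const links: string[] = []
  let plcontinue: string | undefined
  do {
    const url = new URL(`https://${locale}.wikipedia.org/w/api.php`)
    url.searchParams.set("action", "query")
    url.searchParams.set("titles", title)
    url.searchParams.set("prop", "links")
    url.searchParams.set("pllimit", "max")
    url.searchParams.set("format", "json")
    if (plcontinue != null) {
      url.searchParams.set("plcontinue", plcontinue)
    }
    const response = await fetch(url)
    if (!response.ok) {
      throw new Error(response.statusText)
    }
    const json = (await response.json()) as LinksQueryResponse
    for (const page of Object.values(json.query.pages)) {
      links.push(...(page.links ?? []).map((link) => link.title))
    }
    // Keep requesting until the API stops returning a continuation token.
    plcontinue = json.continue?.plcontinue
  } while (plcontinue != null)
  return links
}

console.log((await fetchInternalLinks("Node.js")).length)
```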
@@ -1,26 +1,62 @@
 #!/usr/bin/env -S node --import=tsx
 
+import type { WikipediaPagesInternalLinks } from "@repo/wikipedia-game-solver/wikipedia-api"
 import { getWikipediaPageInternalLinks } from "@repo/wikipedia-game-solver/wikipedia-api"
+import fs from "node:fs"
+import path from "node:path"
 
 const localeWikipedia = "en"
+const cachePath = path.join(process.cwd(), "cache.json")
 
-const fromPageInput = "Linux"
-const toPageInput = "Node.js"
-console.log({
-  fromPageInput,
-  toPageInput,
-})
-const [fromPageWikipediaLinks, toPageWikipediaLinks] = await Promise.all([
-  getWikipediaPageInternalLinks({
-    title: fromPageInput,
+const fromPageInput = "New York City"
+// const fromPageInput = "Linux"
+// const toPageInput = "Node.js"
+// console.log({
+//   fromPageInput,
+//   toPageInput,
+// })
+// const [fromPageWikipediaLinks, toPageWikipediaLinks] = await Promise.all([
+//   getWikipediaPageInternalLinks({
+//     title: fromPageInput,
+//     locale: localeWikipedia,
+//   }),
+//   getWikipediaPageInternalLinks({
+//     title: toPageInput,
+//     locale: localeWikipedia,
+//   }),
+// ])
+// console.log({
+//   fromPageWikipediaLinks,
+//   toPageWikipediaLinks,
+// })
+// const data = {
+//   [fromPageWikipediaLinks.title]: fromPageWikipediaLinks,
+//   [toPageWikipediaLinks.title]: toPageWikipediaLinks,
+// }
+
+const data = JSON.parse(
+  await fs.promises.readFile(cachePath, { encoding: "utf-8" }),
+) as WikipediaPagesInternalLinks
+
+// let maxLinks = { max: 0, title: "" }
+// for (const [title, page] of Object.entries(data)) {
+//   if (page.links.length > maxLinks.max) {
+//     maxLinks = { max: page.links.length, title }
+//   }
+// }
+// console.log(maxLinks)
+
+const pageLinks = (data[fromPageInput]?.links ?? []).slice(0, 1100)
+for (const pageLink of pageLinks) {
+  if (pageLink in data) {
+    continue
+  }
+  console.log("Fetching", pageLink)
+  data[pageLink] = await getWikipediaPageInternalLinks({
+    title: pageLink,
     locale: localeWikipedia,
-  }),
-  getWikipediaPageInternalLinks({
-    title: toPageInput,
-    locale: localeWikipedia,
-  }),
-])
-console.log({
-  fromPageWikipediaLinks,
-  toPageWikipediaLinks,
-})
+  })
+}
+await fs.promises.writeFile(cachePath, JSON.stringify(data, null, 2), {
+  encoding: "utf-8",
+})
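From how the script indexes it (`data[fromPageInput]?.links`, `data[pageLink] = await getWikipediaPageInternalLinks(...)`), `cache.json` is presumably a map from page title to that page's fetched record. A hedged sketch of that shape plus load/save helpers; the field names are assumptions inferred from usage, not the repo's actual `WikipediaPagesInternalLinks` definition:

```ts
import fs from "node:fs"

// Assumed shape of cache.json, inferred from how the script reads it:
// a map from page title to the record returned for that page.
interface CachedPage {
  title: string
  links: string[]
}
type Cache = Record<string, CachedPage>

const loadCache = async (cachePath: string): Promise<Cache> => {
  try {
    const raw = await fs.promises.readFile(cachePath, { encoding: "utf-8" })
    return JSON.parse(raw) as Cache
  } catch {
    // First run: no cache.json yet, start from an empty map.
    return {}
  }
}

const saveCache = async (cachePath: string, cache: Cache): Promise<void> => {
  await fs.promises.writeFile(cachePath, JSON.stringify(cache, null, 2), {
    encoding: "utf-8",
  })
}
```

Loading with a fallback like this also covers the first run, before any `cache.json` has been written.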
data/README.md (new file) · 17 changes
@@ -0,0 +1,17 @@
+# Wikipedia data
+
+Database layout: <https://www.mediawiki.org/wiki/Manual:Database_layout>
+
+<https://stackoverflow.com/questions/43954631/issues-with-wikipedia-dump-table-pagelinks>
+
+<https://stackoverflow.com/questions/40384864/importing-wikipedia-dump-to-mysql>
+
+## Dumps Links
+
+- <https://dumps.wikimedia.org/enwiki/>
+
+- <https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pagelinks.sql.gz>
+- <https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-page.sql.gz>
+- <https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-all-titles-in-ns0.gz>
+- <https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-iwlinks.sql.gz>
+- <https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-all-titles.gz>
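The dump files listed in that README are large SQL or plain-text archives; the all-titles-in-ns0 one is essentially a gzipped list of one title per line. A sketch of streaming it from disk without loading the whole file into memory, assuming it has already been downloaded next to the script (the file name below is just the default from the dumps page):

```ts
import fs from "node:fs"
import readline from "node:readline"
import zlib from "node:zlib"

// Stream a downloaded enwiki-latest-all-titles-in-ns0.gz (one title per line)
// and count the titles without holding the whole file in memory.
const countTitles = async (dumpPath: string): Promise<number> => {
  const lines = readline.createInterface({
    input: fs.createReadStream(dumpPath).pipe(zlib.createGunzip()),
    crlfDelay: Infinity,
  })
  let count = 0
  for await (const line of lines) {
    if (line.length > 0) {
      count += 1
    }
  }
  return count
}

console.log(await countTitles("enwiki-latest-all-titles-in-ns0.gz"))
```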
@@ -17,6 +17,7 @@
     "@repo/config-tailwind": "workspace:*",
     "@repo/i18n": "workspace:*",
     "@repo/ui": "workspace:*",
+    "ky": "catalog:",
     "next": "catalog:",
     "next-intl": "catalog:",
     "react": "catalog:",
@@ -1,10 +1,8 @@
 "use client"
 
-import type { Locale } from "@repo/i18n/config"
 import { Button } from "@repo/ui/design/Button"
 import { Link } from "@repo/ui/design/Link"
 import { Typography } from "@repo/ui/design/Typography"
-import { useLocale } from "next-intl"
 import { useState } from "react"
 import {
   fromLocaleToWikipediaLocale,
@@ -15,8 +13,7 @@ import {
 export const WikipediaClient: React.FC = () => {
   const [isLoading, setIsLoading] = useState(false)
 
-  const localeCurrent = useLocale() as Locale
-  const localeWikipedia = fromLocaleToWikipediaLocale(localeCurrent)
+  const localeWikipedia = fromLocaleToWikipediaLocale("en-US")
 
   const handleClick: React.MouseEventHandler<HTMLButtonElement> = async () => {
     setIsLoading(true)
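The client change above hard-codes `fromLocaleToWikipediaLocale("en-US")` instead of deriving the value from `useLocale()`. The diff does not show that helper's implementation; a purely hypothetical sketch of what such a mapping could look like, with an assumed locale list that is not the repo's code:

```ts
// Hypothetical sketch: map an app locale ("en-US", "fr-FR", ...) to the
// subdomain Wikipedia uses ("en", "fr", ...). Not the repo's actual helper.
type WikipediaLocale = "en" | "fr"

const fromLocaleToWikipediaLocale = (locale: string): WikipediaLocale => {
  if (locale.toLowerCase().startsWith("fr")) {
    return "fr"
  }
  return "en"
}

console.log(fromLocaleToWikipediaLocale("en-US")) // "en"
```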
@@ -1,4 +1,5 @@
 import type { Locale } from "@repo/i18n/config"
+import ky from "ky"
 
 export const sum = (a: number, b: number): number => {
   return a + b
@@ -99,14 +100,11 @@ export const getWikipediaPageInternalLinks = async (
     if (plcontinue != null) {
       url.searchParams.set("plcontinue", plcontinue)
     }
-    const response = await fetch(url, {
-      method: "GET",
-    })
-    if (!response.ok) {
-      throw new Error(response.statusText)
-    }
-    const json = (await response.json()) as WikipediaQueryLinksResponse
-    return json
+    return await ky
+      .get(url, {
+        method: "GET",
+      })
+      .json()
   }
 
   do {
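The ky rewrite above drops the `as WikipediaQueryLinksResponse` cast that the fetch version had. ky can carry that type through its `.json<T>()` generic, and it throws an `HTTPError` on non-2xx responses by default, so the manual `response.ok` check is no longer needed. A small sketch of the equivalent typed call; the response interface here is a trimmed assumption, not the repo's full type:

```ts
import ky from "ky"

// Trimmed, assumed shape of the MediaWiki response used by this helper.
interface WikipediaQueryLinksResponse {
  continue?: { plcontinue: string }
  query: { pages: Record<string, { title: string; links?: { title: string }[] }> }
}

const fetchLinksPage = async (url: URL): Promise<WikipediaQueryLinksResponse> => {
  // ky rejects with HTTPError for non-2xx responses, so no explicit ok-check is needed.
  return await ky.get(url).json<WikipediaQueryLinksResponse>()
}
```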
@@ -129,8 +127,10 @@ export const getWikipediaPageInternalLinks = async (
           return link.title
         }),
       )
-    } catch {
-      break
+    } catch (error) {
+      console.error("Error", error)
+      console.error("title", title)
+      throw error
     }
   } while (plcontinue != null)
 
@@ -152,27 +152,43 @@ export interface GetDeepWikipediaPageInternalLinksInput {
 
 export const getDeepWikipediaPageInternalLinks = async (
   input: GetDeepWikipediaPageInternalLinksInput,
-): Promise<WikipediaPagesInternalLinks> => {
+): Promise<void> => {
   const pagesTitles = Object.keys(input.data)
-  await Promise.all(
-    pagesTitles.map(async (pageTitle) => {
-      const links = input.data[pageTitle]?.links ?? []
-      await Promise.all(
-        links.map(async (pageTitleLink) => {
-          if (pageTitleLink in input.data) {
-            return
-          }
-          input.data[pageTitleLink] = await getWikipediaPageInternalLinks({
-            locale: input.locale,
-            title: pageTitleLink,
-          })
-          await getDeepWikipediaPageInternalLinks({
-            locale: input.locale,
-            data: input.data,
-          })
-        }),
-      )
-    }),
-  )
-  return input.data
+  for (const pageTitle of pagesTitles) {
+    const links = input.data[pageTitle]?.links ?? []
+    for (const pageTitleLink of links) {
+      if (pageTitleLink in input.data) {
+        continue
+      }
+      input.data[pageTitleLink] = await getWikipediaPageInternalLinks({
+        locale: input.locale,
+        title: pageTitleLink,
+      })
+      // await getDeepWikipediaPageInternalLinks({
+      //   locale: input.locale,
+      //   data: input.data,
+      // })
+    }
+  }
+
+  // await Promise.all(
+  //   pagesTitles.map(async (pageTitle) => {
+  //     const links = input.data[pageTitle]?.links ?? []
+  //     await Promise.all(
+  //       links.map(async (pageTitleLink) => {
+  //         if (pageTitleLink in input.data) {
+  //           return
+  //         }
+  //         input.data[pageTitleLink] = await getWikipediaPageInternalLinks({
+  //           locale: input.locale,
+  //           title: pageTitleLink,
+  //         })
+  //         await getDeepWikipediaPageInternalLinks({
+  //           locale: input.locale,
+  //           data: input.data,
+  //         })
+  //       }),
+  //     )
+  //   }),
+  // )
 }
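With the crawl now sequential and the results persisted to `cache.json`, the actual game-solving step is a shortest-path search over that title-to-links map. A hedged sketch of a breadth-first search on the cached data; the `Cache` shape is the same assumption as above, and pages whose links were never fetched simply act as dead ends:

```ts
// Breadth-first search over the cached adjacency: returns the shortest chain of
// page titles from `from` to `to`, or undefined if no path exists in the cache.
type Cache = Record<string, { title: string; links: string[] }>

const findPath = (cache: Cache, from: string, to: string): string[] | undefined => {
  if (from === to) {
    return [from]
  }
  const previous = new Map<string, string>()
  const queue: string[] = [from]
  previous.set(from, from)
  while (queue.length > 0) {
    const current = queue.shift() as string
    for (const next of cache[current]?.links ?? []) {
      if (previous.has(next)) {
        continue
      }
      previous.set(next, current)
      if (next === to) {
        // Walk the predecessor chain back to reconstruct the path.
        const path = [to]
        let step = current
        while (step !== from) {
          path.push(step)
          step = previous.get(step) as string
        }
        path.push(from)
        return path.reverse()
      }
      queue.push(next)
    }
  }
  return undefined
}

// e.g. findPath(data, "New York City", "Node.js") once enough pages are cached.
```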
@@ -111,6 +111,9 @@ catalogs:
     http-server:
      specifier: 14.1.1
      version: 14.1.1
+    ky:
+      specifier: 1.5.0
+      version: 1.5.0
     next:
      specifier: 14.2.5
      version: 14.2.5
@@ -626,6 +629,9 @@ importers:
       '@repo/ui':
         specifier: workspace:*
         version: link:../ui
+      ky:
+        specifier: 'catalog:'
+        version: 1.5.0
       next:
         specifier: 'catalog:'
         version: 14.2.5(@babel/core@7.24.9)(@playwright/test@1.45.3)(react-dom@18.3.1(react@18.3.1))(react@18.3.1)
@@ -5448,6 +5454,10 @@ packages:
     resolution: {integrity: sha512-dhG34DXATL5hSxJbIexCft8FChFXtmskoZYnoPWjXQuebWYCNkVeV3KkGegCK9CP1oswI/vQibS2GY7Em/sJJA==}
     engines: {node: '>= 8'}
 
+  ky@1.5.0:
+    resolution: {integrity: sha512-bkQo+UqryW6Zmo/DsixYZE4Z9t2mzvNMhceyIhuMuInb3knm5Q+GNGMKveydJAj+Z6piN1SwI6eR/V0G+Z0BtA==}
+    engines: {node: '>=18'}
+
   language-subtag-registry@0.3.23:
     resolution: {integrity: sha512-0K65Lea881pHotoGEa5gDlMxt3pctLi2RplBb7Ezh4rRdLEOtgi7n4EwK9lamnUCkKBqaeKRVebTq6BAxSkpXQ==}
 
@@ -14005,6 +14015,8 @@ snapshots:
 
   klona@2.0.6: {}
 
+  ky@1.5.0: {}
+
   language-subtag-registry@0.3.23: {}
 
   language-tags@1.0.9:
@@ -5,6 +5,7 @@ packages:
 catalog:
   # Utils
   "deepmerge": "4.3.1"
+  "ky": "1.5.0"
 
   # React.js/Next.js
   "next": "14.2.5"