feat: cache.json for wikipedia links
This commit is contained in:
parent
867fc131b1
commit
ccd44c10fa
1
.gitignore
vendored
1
.gitignore
vendored
@ -21,6 +21,7 @@ build/
|
|||||||
*.pem
|
*.pem
|
||||||
.turbo
|
.turbo
|
||||||
bin/
|
bin/
|
||||||
|
cache.json
|
||||||
|
|
||||||
# debug
|
# debug
|
||||||
npm-debug.log*
|
npm-debug.log*
|
||||||
|
2
TODO.md
2
TODO.md
@ -11,6 +11,8 @@
|
|||||||
|
|
||||||
## Links
|
## Links
|
||||||
|
|
||||||
|
- <https://github.com/shyamupa/wikidump_preprocessing>
|
||||||
|
- <https://www.mediawiki.org/wiki/API:Allpages>
|
||||||
- <https://www.thewikigame.com/>
|
- <https://www.thewikigame.com/>
|
||||||
- How to get all URLs in a Wikipedia page: <https://stackoverflow.com/questions/14882571/how-to-get-all-urls-in-a-wikipedia-page>
|
- How to get all URLs in a Wikipedia page: <https://stackoverflow.com/questions/14882571/how-to-get-all-urls-in-a-wikipedia-page>
|
||||||
- <https://en.wikipedia.org/w/api.php?action=query&titles=Title&prop=links&pllimit=max&format=json>
|
- <https://en.wikipedia.org/w/api.php?action=query&titles=Title&prop=links&pllimit=max&format=json>
|
||||||
|
@ -1,26 +1,62 @@
|
|||||||
#!/usr/bin/env -S node --import=tsx
|
#!/usr/bin/env -S node --import=tsx
|
||||||
|
|
||||||
|
import type { WikipediaPagesInternalLinks } from "@repo/wikipedia-game-solver/wikipedia-api"
|
||||||
import { getWikipediaPageInternalLinks } from "@repo/wikipedia-game-solver/wikipedia-api"
|
import { getWikipediaPageInternalLinks } from "@repo/wikipedia-game-solver/wikipedia-api"
|
||||||
|
import fs from "node:fs"
|
||||||
|
import path from "node:path"
|
||||||
|
|
||||||
const localeWikipedia = "en"
|
const localeWikipedia = "en"
|
||||||
|
const cachePath = path.join(process.cwd(), "cache.json")
|
||||||
|
|
||||||
const fromPageInput = "Linux"
|
const fromPageInput = "New York City"
|
||||||
const toPageInput = "Node.js"
|
// const fromPageInput = "Linux"
|
||||||
console.log({
|
// const toPageInput = "Node.js"
|
||||||
fromPageInput,
|
// console.log({
|
||||||
toPageInput,
|
// fromPageInput,
|
||||||
})
|
// toPageInput,
|
||||||
const [fromPageWikipediaLinks, toPageWikipediaLinks] = await Promise.all([
|
// })
|
||||||
getWikipediaPageInternalLinks({
|
// const [fromPageWikipediaLinks, toPageWikipediaLinks] = await Promise.all([
|
||||||
title: fromPageInput,
|
// getWikipediaPageInternalLinks({
|
||||||
|
// title: fromPageInput,
|
||||||
|
// locale: localeWikipedia,
|
||||||
|
// }),
|
||||||
|
// getWikipediaPageInternalLinks({
|
||||||
|
// title: toPageInput,
|
||||||
|
// locale: localeWikipedia,
|
||||||
|
// }),
|
||||||
|
// ])
|
||||||
|
// console.log({
|
||||||
|
// fromPageWikipediaLinks,
|
||||||
|
// toPageWikipediaLinks,
|
||||||
|
// })
|
||||||
|
// const data = {
|
||||||
|
// [fromPageWikipediaLinks.title]: fromPageWikipediaLinks,
|
||||||
|
// [toPageWikipediaLinks.title]: toPageWikipediaLinks,
|
||||||
|
// }
|
||||||
|
|
||||||
|
const data = JSON.parse(
|
||||||
|
await fs.promises.readFile(cachePath, { encoding: "utf-8" }),
|
||||||
|
) as WikipediaPagesInternalLinks
|
||||||
|
|
||||||
|
// let maxLinks = { max: 0, title: "" }
|
||||||
|
// for (const [title, page] of Object.entries(data)) {
|
||||||
|
// if (page.links.length > maxLinks.max) {
|
||||||
|
// maxLinks = { max: page.links.length, title }
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
// console.log(maxLinks)
|
||||||
|
|
||||||
|
const pageLinks = (data[fromPageInput]?.links ?? []).slice(0, 1100)
|
||||||
|
for (const pageLink of pageLinks) {
|
||||||
|
if (pageLink in data) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
console.log("Fetching", pageLink)
|
||||||
|
data[pageLink] = await getWikipediaPageInternalLinks({
|
||||||
|
title: pageLink,
|
||||||
locale: localeWikipedia,
|
locale: localeWikipedia,
|
||||||
}),
|
})
|
||||||
getWikipediaPageInternalLinks({
|
}
|
||||||
title: toPageInput,
|
await fs.promises.writeFile(cachePath, JSON.stringify(data, null, 2), {
|
||||||
locale: localeWikipedia,
|
encoding: "utf-8",
|
||||||
}),
|
|
||||||
])
|
|
||||||
console.log({
|
|
||||||
fromPageWikipediaLinks,
|
|
||||||
toPageWikipediaLinks,
|
|
||||||
})
|
})
|
||||||
|
17
data/README.md
Normal file
17
data/README.md
Normal file
@ -0,0 +1,17 @@
|
|||||||
|
# Wikipedia data
|
||||||
|
|
||||||
|
Database layout: <https://www.mediawiki.org/wiki/Manual:Database_layout>
|
||||||
|
|
||||||
|
<https://stackoverflow.com/questions/43954631/issues-with-wikipedia-dump-table-pagelinks>
|
||||||
|
|
||||||
|
<https://stackoverflow.com/questions/40384864/importing-wikipedia-dump-to-mysql>
|
||||||
|
|
||||||
|
## Dumps Links
|
||||||
|
|
||||||
|
- <https://dumps.wikimedia.org/enwiki/>
|
||||||
|
|
||||||
|
- <https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pagelinks.sql.gz>
|
||||||
|
- <https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-page.sql.gz>
|
||||||
|
- <https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-all-titles-in-ns0.gz>
|
||||||
|
- <https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-iwlinks.sql.gz>
|
||||||
|
- <https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-all-titles.gz>
|
@ -17,6 +17,7 @@
|
|||||||
"@repo/config-tailwind": "workspace:*",
|
"@repo/config-tailwind": "workspace:*",
|
||||||
"@repo/i18n": "workspace:*",
|
"@repo/i18n": "workspace:*",
|
||||||
"@repo/ui": "workspace:*",
|
"@repo/ui": "workspace:*",
|
||||||
|
"ky": "catalog:",
|
||||||
"next": "catalog:",
|
"next": "catalog:",
|
||||||
"next-intl": "catalog:",
|
"next-intl": "catalog:",
|
||||||
"react": "catalog:",
|
"react": "catalog:",
|
||||||
|
@ -1,10 +1,8 @@
|
|||||||
"use client"
|
"use client"
|
||||||
|
|
||||||
import type { Locale } from "@repo/i18n/config"
|
|
||||||
import { Button } from "@repo/ui/design/Button"
|
import { Button } from "@repo/ui/design/Button"
|
||||||
import { Link } from "@repo/ui/design/Link"
|
import { Link } from "@repo/ui/design/Link"
|
||||||
import { Typography } from "@repo/ui/design/Typography"
|
import { Typography } from "@repo/ui/design/Typography"
|
||||||
import { useLocale } from "next-intl"
|
|
||||||
import { useState } from "react"
|
import { useState } from "react"
|
||||||
import {
|
import {
|
||||||
fromLocaleToWikipediaLocale,
|
fromLocaleToWikipediaLocale,
|
||||||
@ -15,8 +13,7 @@ import {
|
|||||||
export const WikipediaClient: React.FC = () => {
|
export const WikipediaClient: React.FC = () => {
|
||||||
const [isLoading, setIsLoading] = useState(false)
|
const [isLoading, setIsLoading] = useState(false)
|
||||||
|
|
||||||
const localeCurrent = useLocale() as Locale
|
const localeWikipedia = fromLocaleToWikipediaLocale("en-US")
|
||||||
const localeWikipedia = fromLocaleToWikipediaLocale(localeCurrent)
|
|
||||||
|
|
||||||
const handleClick: React.MouseEventHandler<HTMLButtonElement> = async () => {
|
const handleClick: React.MouseEventHandler<HTMLButtonElement> = async () => {
|
||||||
setIsLoading(true)
|
setIsLoading(true)
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
import type { Locale } from "@repo/i18n/config"
|
import type { Locale } from "@repo/i18n/config"
|
||||||
|
import ky from "ky"
|
||||||
|
|
||||||
export const sum = (a: number, b: number): number => {
|
export const sum = (a: number, b: number): number => {
|
||||||
return a + b
|
return a + b
|
||||||
@ -99,14 +100,11 @@ export const getWikipediaPageInternalLinks = async (
|
|||||||
if (plcontinue != null) {
|
if (plcontinue != null) {
|
||||||
url.searchParams.set("plcontinue", plcontinue)
|
url.searchParams.set("plcontinue", plcontinue)
|
||||||
}
|
}
|
||||||
const response = await fetch(url, {
|
return await ky
|
||||||
|
.get(url, {
|
||||||
method: "GET",
|
method: "GET",
|
||||||
})
|
})
|
||||||
if (!response.ok) {
|
.json()
|
||||||
throw new Error(response.statusText)
|
|
||||||
}
|
|
||||||
const json = (await response.json()) as WikipediaQueryLinksResponse
|
|
||||||
return json
|
|
||||||
}
|
}
|
||||||
|
|
||||||
do {
|
do {
|
||||||
@ -129,8 +127,10 @@ export const getWikipediaPageInternalLinks = async (
|
|||||||
return link.title
|
return link.title
|
||||||
}),
|
}),
|
||||||
)
|
)
|
||||||
} catch {
|
} catch (error) {
|
||||||
break
|
console.error("Error", error)
|
||||||
|
console.error("title", title)
|
||||||
|
throw error
|
||||||
}
|
}
|
||||||
} while (plcontinue != null)
|
} while (plcontinue != null)
|
||||||
|
|
||||||
@ -152,27 +152,43 @@ export interface GetDeepWikipediaPageInternalLinksInput {
|
|||||||
|
|
||||||
export const getDeepWikipediaPageInternalLinks = async (
|
export const getDeepWikipediaPageInternalLinks = async (
|
||||||
input: GetDeepWikipediaPageInternalLinksInput,
|
input: GetDeepWikipediaPageInternalLinksInput,
|
||||||
): Promise<WikipediaPagesInternalLinks> => {
|
): Promise<void> => {
|
||||||
const pagesTitles = Object.keys(input.data)
|
const pagesTitles = Object.keys(input.data)
|
||||||
await Promise.all(
|
for (const pageTitle of pagesTitles) {
|
||||||
pagesTitles.map(async (pageTitle) => {
|
|
||||||
const links = input.data[pageTitle]?.links ?? []
|
const links = input.data[pageTitle]?.links ?? []
|
||||||
await Promise.all(
|
for (const pageTitleLink of links) {
|
||||||
links.map(async (pageTitleLink) => {
|
|
||||||
if (pageTitleLink in input.data) {
|
if (pageTitleLink in input.data) {
|
||||||
return
|
continue
|
||||||
}
|
}
|
||||||
input.data[pageTitleLink] = await getWikipediaPageInternalLinks({
|
input.data[pageTitleLink] = await getWikipediaPageInternalLinks({
|
||||||
locale: input.locale,
|
locale: input.locale,
|
||||||
title: pageTitleLink,
|
title: pageTitleLink,
|
||||||
})
|
})
|
||||||
await getDeepWikipediaPageInternalLinks({
|
// await getDeepWikipediaPageInternalLinks({
|
||||||
locale: input.locale,
|
// locale: input.locale,
|
||||||
data: input.data,
|
// data: input.data,
|
||||||
})
|
// })
|
||||||
}),
|
}
|
||||||
)
|
}
|
||||||
}),
|
|
||||||
)
|
// await Promise.all(
|
||||||
return input.data
|
// pagesTitles.map(async (pageTitle) => {
|
||||||
|
// const links = input.data[pageTitle]?.links ?? []
|
||||||
|
// await Promise.all(
|
||||||
|
// links.map(async (pageTitleLink) => {
|
||||||
|
// if (pageTitleLink in input.data) {
|
||||||
|
// return
|
||||||
|
// }
|
||||||
|
// input.data[pageTitleLink] = await getWikipediaPageInternalLinks({
|
||||||
|
// locale: input.locale,
|
||||||
|
// title: pageTitleLink,
|
||||||
|
// })
|
||||||
|
// await getDeepWikipediaPageInternalLinks({
|
||||||
|
// locale: input.locale,
|
||||||
|
// data: input.data,
|
||||||
|
// })
|
||||||
|
// }),
|
||||||
|
// )
|
||||||
|
// }),
|
||||||
|
// )
|
||||||
}
|
}
|
||||||
|
@ -111,6 +111,9 @@ catalogs:
|
|||||||
http-server:
|
http-server:
|
||||||
specifier: 14.1.1
|
specifier: 14.1.1
|
||||||
version: 14.1.1
|
version: 14.1.1
|
||||||
|
ky:
|
||||||
|
specifier: 1.5.0
|
||||||
|
version: 1.5.0
|
||||||
next:
|
next:
|
||||||
specifier: 14.2.5
|
specifier: 14.2.5
|
||||||
version: 14.2.5
|
version: 14.2.5
|
||||||
@ -626,6 +629,9 @@ importers:
|
|||||||
'@repo/ui':
|
'@repo/ui':
|
||||||
specifier: workspace:*
|
specifier: workspace:*
|
||||||
version: link:../ui
|
version: link:../ui
|
||||||
|
ky:
|
||||||
|
specifier: 'catalog:'
|
||||||
|
version: 1.5.0
|
||||||
next:
|
next:
|
||||||
specifier: 'catalog:'
|
specifier: 'catalog:'
|
||||||
version: 14.2.5(@babel/core@7.24.9)(@playwright/test@1.45.3)(react-dom@18.3.1(react@18.3.1))(react@18.3.1)
|
version: 14.2.5(@babel/core@7.24.9)(@playwright/test@1.45.3)(react-dom@18.3.1(react@18.3.1))(react@18.3.1)
|
||||||
@ -5448,6 +5454,10 @@ packages:
|
|||||||
resolution: {integrity: sha512-dhG34DXATL5hSxJbIexCft8FChFXtmskoZYnoPWjXQuebWYCNkVeV3KkGegCK9CP1oswI/vQibS2GY7Em/sJJA==}
|
resolution: {integrity: sha512-dhG34DXATL5hSxJbIexCft8FChFXtmskoZYnoPWjXQuebWYCNkVeV3KkGegCK9CP1oswI/vQibS2GY7Em/sJJA==}
|
||||||
engines: {node: '>= 8'}
|
engines: {node: '>= 8'}
|
||||||
|
|
||||||
|
ky@1.5.0:
|
||||||
|
resolution: {integrity: sha512-bkQo+UqryW6Zmo/DsixYZE4Z9t2mzvNMhceyIhuMuInb3knm5Q+GNGMKveydJAj+Z6piN1SwI6eR/V0G+Z0BtA==}
|
||||||
|
engines: {node: '>=18'}
|
||||||
|
|
||||||
language-subtag-registry@0.3.23:
|
language-subtag-registry@0.3.23:
|
||||||
resolution: {integrity: sha512-0K65Lea881pHotoGEa5gDlMxt3pctLi2RplBb7Ezh4rRdLEOtgi7n4EwK9lamnUCkKBqaeKRVebTq6BAxSkpXQ==}
|
resolution: {integrity: sha512-0K65Lea881pHotoGEa5gDlMxt3pctLi2RplBb7Ezh4rRdLEOtgi7n4EwK9lamnUCkKBqaeKRVebTq6BAxSkpXQ==}
|
||||||
|
|
||||||
@ -14005,6 +14015,8 @@ snapshots:
|
|||||||
|
|
||||||
klona@2.0.6: {}
|
klona@2.0.6: {}
|
||||||
|
|
||||||
|
ky@1.5.0: {}
|
||||||
|
|
||||||
language-subtag-registry@0.3.23: {}
|
language-subtag-registry@0.3.23: {}
|
||||||
|
|
||||||
language-tags@1.0.9:
|
language-tags@1.0.9:
|
||||||
|
@ -5,6 +5,7 @@ packages:
|
|||||||
catalog:
|
catalog:
|
||||||
# Utils
|
# Utils
|
||||||
"deepmerge": "4.3.1"
|
"deepmerge": "4.3.1"
|
||||||
|
"ky": "1.5.0"
|
||||||
|
|
||||||
# React.js/Next.js
|
# React.js/Next.js
|
||||||
"next": "14.2.5"
|
"next": "14.2.5"
|
||||||
|
Reference in New Issue
Block a user