import fs from "node:fs"
import path from "node:path"
import {
  extractRowsFromSQLValues,
  swapKeysAndValues,
  zeroPad,
} from "./utils.js"

const SQL_DUMP_PATH = path.join(process.cwd(), "dump")
const SQL_FILENAME_NUMBER_PAD = 4

/**
 * @typedef {Record<string, number>} WikipediaPagesKeyTitle
 *
 * Object to store pages from Wikipedia:
 * - Key: page title sanitized - The real title shown is this title with underscores (_) converted to spaces ( ).
 * - Value: page id.
 */

/**
 * @typedef {Record<string, number>} WikipediaPagesKeyId
 *
 * Object to store pages from Wikipedia:
 * - Key: page id.
 * - Value: page title sanitized - The real title shown is this title with underscores (_) converted to spaces ( ).
 */

/**
 * @typedef WikipediaInternalLink
 * @property {number} fromPageId
 * @property {number} toPageId
 */

/**
 * Function to clean the `page.sql` file by:
 * - Removing all lines that don't start with `INSERT INTO...`.
 * - Filter by keeping rows where `page_namespace` (2nd column) is equal to 0, and where `page_is_redirect` (4th column) is equal to false (0).
 * - Only keep columns `page_id` (1st column) and `page_title` (3rd column).
 * @returns {Promise<WikipediaPagesKeyId>}
 */
const cleanPagesSQL = async () => {
  /** @type {WikipediaPagesKeyId} */
  const wikipediaPagesKeyId = {}

  const INSERT_INTO_START_INPUT = "INSERT INTO `page` VALUES "
  const sqlInputPath = path.join(SQL_DUMP_PATH, "page.sql")
  const sqlInputStat = await fs.promises.stat(sqlInputPath)
  const sqlInputFileStream = fs.createReadStream(sqlInputPath, "utf-8")

  let isInsideInsert = false
  let current = ""
  let lastPercent = 0

  let pagesFileCount = 1

  const INSERT_INTO_START_OUTPUT = "INSERT INTO pages (id, title) VALUES "

  const BATCH_SIZE = 1_000_000

  /**
   * @type {string[]}
   */
  let batch = []

  const flushBatch = async () => {
    if (batch.length > 0) {
      const batchString = batch.join(",")
      const fileName = `${zeroPad(pagesFileCount, SQL_FILENAME_NUMBER_PAD)}-pages-inserts.sql`
      const sqlOutputPath = path.join(
        process.cwd(),
        "sql-pages-inserts",
        fileName,
      )
      await fs.promises.writeFile(
        sqlOutputPath,
        `${INSERT_INTO_START_OUTPUT}${batchString};`,
        {
          encoding: "utf-8",
        },
      )
      console.log(`flushBatch - ${fileName}, batch.length: ${batch.length}`)
      pagesFileCount += 1
      batch = []
    }
  }

  return await new Promise((resolve, reject) => {
    sqlInputFileStream
      .on("data", async (dataInput) => {
        const bytesReadRatio = sqlInputFileStream.bytesRead / sqlInputStat.size
        const bytesReadPercent = bytesReadRatio * 100

        if (bytesReadPercent - lastPercent >= 1) {
          console.log(
            `cleanPagesSQL - Bytes read (${bytesReadPercent.toFixed(2)}%): ${sqlInputFileStream.bytesRead} / ${sqlInputStat.size}`,
          )
          lastPercent = bytesReadPercent
        }

        let data = current + dataInput

        if (!isInsideInsert) {
          const lines = data.split("\n").filter((line) => {
            return line.startsWith(INSERT_INTO_START_INPUT)
          })
          const [line] = lines
          if (line == null) {
            sqlInputFileStream.close()
            return reject(new Error(`No "${INSERT_INTO_START_INPUT}" found.`))
          }
          isInsideInsert = true
          const lineStripped = line.slice(INSERT_INTO_START_INPUT.length)
          data = lineStripped
        }

        const { rows, unCompleted } = extractRowsFromSQLValues(data)
        current = unCompleted

        for (const row of rows) {
          if (row.length !== 12) {
            sqlInputFileStream.close()
            console.error([row])
            return reject(new Error(`Invalid Row values.`))
          }

          const id = Number.parseInt(row[0] ?? "0", 10)
          const namespace = row[1] ?? ""
          const title = row[2] ?? ""
          const isRedirect = row[3] === "1"

          if (namespace === "0" && !isRedirect) {
            wikipediaPagesKeyId[id] = title
            batch.push(`(${id},E${title})`)
          }
        }

        if (batch.length >= BATCH_SIZE) {
          sqlInputFileStream.pause()
          await flushBatch()
          sqlInputFileStream.resume()
        }
      })
      .on("error", (error) => {
        return reject(error)
      })
      .on("close", async () => {
        await flushBatch()
        console.log("cleanPagesSQL - Bytes read (100%).")
        return resolve(wikipediaPagesKeyId)
      })
  })
}

const wikipediaPagesKeyId = await cleanPagesSQL()

/**
 * Function to clean the `pagelinks.sql` file by:
 * - Removing all lines that don't start with `INSERT INTO...`.
 * - Filter by keeping rows where `pl_from_namespace` (2nd column) is equal to 0.
 * - Transform the rows to internal links with fromPageId and toPageId.
 * @returns {Promise<void>}
 */
const cleanInternalLinksSQL = async () => {
  let internalLinksFileCount = 1
  const INSERT_INTO_START_OUTPUT =
    "INSERT INTO internal_links (from_page_id, to_page_id) VALUES "

  /**
   * @type {WikipediaPagesKeyTitle}
   */
  const wikipediaPagesKeyTitle = swapKeysAndValues(wikipediaPagesKeyId)

  const INSERT_INTO_START_INPUT = "INSERT INTO `pagelinks` VALUES "
  const sqlInputPath = path.join(SQL_DUMP_PATH, "pagelinks.sql")
  const sqlInputStat = await fs.promises.stat(sqlInputPath)
  const sqlInputFileStream = fs.createReadStream(sqlInputPath, "utf-8")

  let isInsideInsert = false
  let current = ""
  let lastPercent = 0

  const BATCH_SIZE = 4_000_000

  /**
   * @type {string[]}
   */
  let batch = []

  const flushBatch = async () => {
    if (batch.length > 0) {
      const batchString = batch.join(",")
      const fileName = `${zeroPad(internalLinksFileCount, SQL_FILENAME_NUMBER_PAD)}-internal-links-inserts.sql`
      const sqlOutputPath = path.join(
        process.cwd(),
        "sql-internal-links-inserts",
        fileName,
      )
      await fs.promises.writeFile(
        sqlOutputPath,
        `${INSERT_INTO_START_OUTPUT}${batchString};`,
        {
          encoding: "utf-8",
        },
      )
      console.log(`flushBatch - ${fileName}, batch.length: ${batch.length}`)
      internalLinksFileCount += 1
      batch = []
    }
  }

  return await new Promise((resolve, reject) => {
    sqlInputFileStream
      .on("data", async (dataInput) => {
        const bytesReadRatio = sqlInputFileStream.bytesRead / sqlInputStat.size
        const bytesReadPercent = bytesReadRatio * 100

        if (bytesReadPercent - lastPercent >= 0.5) {
          console.log(
            `cleanInternalLinksSQL - Bytes read (${bytesReadPercent.toFixed(2)}%): ${sqlInputFileStream.bytesRead} / ${sqlInputStat.size}`,
          )
          lastPercent = bytesReadPercent
        }

        let data = current + dataInput

        if (!isInsideInsert) {
          const lines = data.split("\n").filter((line) => {
            return line.startsWith(INSERT_INTO_START_INPUT)
          })
          const [line] = lines
          if (line == null) {
            sqlInputFileStream.close()
            return reject(new Error(`No "${INSERT_INTO_START_INPUT}" found.`))
          }
          isInsideInsert = true
          const lineStripped = line.slice(INSERT_INTO_START_INPUT.length)
          data = lineStripped
        }

        const { rows, unCompleted } = extractRowsFromSQLValues(data)
        current = unCompleted

        for (const row of rows) {
          if (row.length !== 5) {
            sqlInputFileStream.close()
            console.error([row])
            return reject(new Error(`Invalid Row values.`))
          }

          const plFromPageId = Number.parseInt(row[0] ?? "0", 10)
          const plTargetNamespace = row[1] ?? ""
          const plTargetTitle = row[2] ?? ""
          const plFromNamespace = row[3] ?? ""

          if (plFromNamespace === "0" && plTargetNamespace === "0") {
            const toPageId = wikipediaPagesKeyTitle[plTargetTitle]
            if (toPageId != null && wikipediaPagesKeyId[plFromPageId] != null) {
              /**
               * @type {WikipediaInternalLink}
               */
              const wikipediaInternalLink = {
                fromPageId: plFromPageId,
                toPageId,
              }
              batch.push(
                `(${wikipediaInternalLink.fromPageId},${wikipediaInternalLink.toPageId})`,
              )
            }
          }
        }

        if (batch.length >= BATCH_SIZE) {
          sqlInputFileStream.pause()
          await flushBatch()
          sqlInputFileStream.resume()
        }
      })
      .on("error", (error) => {
        return reject(error)
      })
      .on("close", async () => {
        await flushBatch()
        console.log("cleanInternalLinksSQL - Bytes read (100%).")
        return resolve()
      })
  })
}

await cleanInternalLinksSQL()