"""
one-time-use code to convert SIPB's IkiWiki website content (mdwn files)
to a Hugo website (md files)

it is far from an end-all-be-all converter from IkiWiki to Hugo,
since it is not built to handle everything in IkiWiki, only what
SIPB happened to use
"""

REPO_DIR = "/home/rgabriel/Projects/SIPB/Website/Old/wiki"
OUTPUT_DIR = "/home/rgabriel/Projects/SIPB/Website/New/website/converted"
ARCHIVE_DIR = "archive"

import os
import sys
import re
import subprocess
from datetime import datetime
import toml
import shutil
from talk import summarize
from time import sleep

class HugoPage:
    """
    a Hugo .md content page
    """

    title: str | None = None
    created: datetime | None = None
    modified: datetime | None = None
    body: str | None = None  # as markdown
    url: str | None = None  # main URL
    aliases: list[str] | None = None  # other URLs that redirect here
    summary: str | None = None

    def __init__(self):
        # initialize per instance, otherwise every page shares one list (aliasing, 6.101-style)
        self.aliases = []

    def get_front_matter(self):
        """
        gets a TOML string corresponding to its front matter
        """
        # values that are None are automatically omitted by `toml`
        return toml.dumps(
            {
                "title": self.title,
                "date": self.created,
                "lastmod": self.modified,
                "url": self.url,
                # if there are no aliases, don't emit the key at all
                "aliases": self.aliases if self.aliases else None,
                "summary": self.summary,
            }
        )

    def get_contents(self):
        """
        gets the content of this file
        """
        return f"+++\n{self.get_front_matter()}+++\n\n{self.body}\n"


class Redirect:
    """
    a "page" that only redirects to another page. use as an intermediate
    step during the conversion, since it is not supposed to exist as a .md
    file and instead only exists in "aliases"
    """

    destination: str  # the path it points to

    def get_contents(self):
        print(f"Warning: creating fake redirect to {self.destination}", file=sys.stderr)
        return f"Please go to [{self.destination}]({convert_link(self.destination)})"

    def __init__(self, destination):
        self.destination = destination


def extract_title(s):
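    # e.g. extract_title('[[!meta title="Projects"]]') returns 'Projects'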
    expr = re.compile(r'\[\[!meta title="(.+)"\]\]')
    return re.search(expr, s)[1]


def extract_redir(s):
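    # e.g. extract_redir('[[!meta redir=projects]]') returns 'projects'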
    # note! no double quotes here!
    expr = re.compile(r"\[\[!meta redir=(.+)\]\]")
    return re.search(expr, s)[1]


def convert_link(link: str):
    """
    converts a wikilink destination to an absolute path
    (relative to the / of the website)
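
    for example:
        convert_link('projects') -> '/projects'
        convert_link('/index')   -> '/'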
    """
    # TODO: check existence

    # convert to absolute link
    if not link.startswith('/'):
        link = '/' + link

    # convert index wikilink
    if link == '/index':
        link = '/'

    # relative links (relative to the current page) are not handled here
    # TODO: use Hugo's relref for those
    return link


def ikiwiki_to_hugo(md) -> HugoPage | Redirect:
    page = HugoPage()

    # get title
    md = md.split("\n")
    if "!meta title" in md[0]:
        page.title = extract_title(md[0])
        md = md[1:]
    if "!meta redir" in md[0]:
        return Redirect(extract_redir(md[0]))
    md = "\n".join(md)

    # replace ikiwiki toc with hugo shortcode
    md = md.replace('[[!toc]]', '{{< toc >}}')

    # replace full wikilinks with []() syntax
    # (*? instead of * does a lazy search)
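    # e.g. [[Our projects|projects]] becomes [Our projects](/projects)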
    wikilink = re.compile(r"\[\[([^\!].*?)\|(.+?)\]\]")
    while match := re.search(wikilink, md):
        correct = f"[{match[1]}]({convert_link(match[2])})"  # in actual markdown syntax
        # splice the replacement in directly, so that backslashes in the link text
        # are not reinterpreted as re.sub escape sequences
        md = md[:match.start()] + correct + md[match.end():]

    # replace abbreviated wikilinks with <> syntax
    # this is greedier than the last one but we can assume we already replaced all of those
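    # e.g. [[minutes]] becomes [minutes](/minutes)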
    abbr_wikilink = re.compile(r"\[\[([^\!].*?)\]\]")
    while match := re.search(abbr_wikilink, md):
        # hugo doesn't have a <wikilink>-style shorthand, at least with the default
        # markdown engine, so link the page name to itself with []() syntax
        correct = f"[{match[1]}]({convert_link(match[1])})"
        md = md[:match.start()] + correct + md[match.end():]

    page.body = md

    return page


def get_all_mdwn_files(dir=REPO_DIR, prefix=""):
    """
    recursively find all *.mdwn files and return a list of them
    """
    ans = []
    for file in os.listdir(dir):
        path = os.path.join(dir, file)
        if os.path.isdir(path):
            ans.extend(get_all_mdwn_files(path, prefix=prefix + file + "/"))
        elif file.endswith(".mdwn"):
            # i think 6.102 used flatMap for something like this
            ans.append(prefix + file)
        else:
            print(f'Ignoring non-mdwn file {prefix + file}')
    return ans


files = get_all_mdwn_files()
website: dict[str, HugoPage | Redirect] = {}


def convert_filename(filename, all_files):
    """
    convert an ikiwiki path to a hugo path (without trailing .md)
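
    e.g. "index.mdwn" -> "_index", and "projects.mdwn" -> "projects/_index"
    if some other page lives under projects/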
    """
    # strip .mdwn extension
    filename = filename[:-5]

    # replace index with _index
    if filename == "index":
        filename = "_index"

    # if the parent directory exists, use _index.md instead
    if filename in {os.path.dirname(f) for f in all_files}:
        filename = os.path.join(filename, "_index")

    return filename


def get_dates(file):
    """
    get the creation date of a file (i.e. first commit in which it appears)
    and the last modified date of a file as a tuple, in that order
    """
    # https://stackoverflow.com/questions/2390199/finding-the-date-time-a-file-was-first-added-to-a-git-repository
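    # effectively runs: git log --follow --format=%ad --date unix <file>
    # which prints one unix timestamp per commit that touched <file>, newest first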
    
    log = subprocess.check_output(
        ['git', 'log', '--follow', '--format=%ad', '--date', 'unix', file]
    ).decode().split('\n')
    log = [line for line in log if line]
    oldest_timestamp = int(log[-1])
    # normally hugo could pull lastmod from git itself, but since we copy these
    # files into a new repo without their history, record it explicitly here
    newest_timestamp = int(log[0])
    return datetime.fromtimestamp(oldest_timestamp), datetime.fromtimestamp(newest_timestamp)


def initial_convert_all_pages():
    """
    convert all pages from mdwn to md, populating
    the `website` global dict with relative paths as keys
    and either a page or redirect as values
    """
    for file in files:
        with open(os.path.join(REPO_DIR, file), "r") as f:
            mdwn = f.read()
        # remove the mdwn extension
        filename = convert_filename(file, files)
        website[filename] = ikiwiki_to_hugo(mdwn)
        # get dates from git
        created, modified = get_dates(file)
        website[filename].created = created
        website[filename].modified = modified
        

def save_to_disk():
    """
    saves the content of the `website` global to disk
    """

    # recreate the output directory
    if os.path.isdir(OUTPUT_DIR):
        shutil.rmtree(OUTPUT_DIR)
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    for file, page in website.items():
        # make parent directories
        os.makedirs(os.path.join(OUTPUT_DIR, os.path.dirname(file)), exist_ok=True)

        # actually write file contents
        with open(os.path.join(OUTPUT_DIR, file + ".md"), "w") as f:
            f.write(page.get_contents())


def fix_alias(file):
    """
    given a file path without .md extension,
    convert it to a format that hugo likes for aliases
    (i.e. no trailing _index, and a leading slash)
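
    e.g. "projects/_index" -> "/projects", "clubs" -> "/clubs"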
    """
    alias = file
    # we don't want trailing _index
    if os.path.basename(alias) == '_index':
        alias = os.path.dirname(alias)
    # alias path should start with /
    if not alias.startswith('/'):
        alias = '/' + alias
    assert alias != '/'
    return alias


def fix_redirects():
    """
    delete all `Redirect`s from `website`, and add them to `aliases` instead
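
    e.g. if clubs.mdwn only contained [[!meta redir=projects]], the page
    "clubs" is dropped and "/clubs" is added to the aliases of "projects"
    (or "projects/_index")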
    """
    redirects = [(file, page) for file, page in website.items() if isinstance(page, Redirect)]
    for file, page in redirects:
        del website[file]
        dest = page.destination
        if dest.startswith('/'):
            dest = dest[1:]
        file = fix_alias(file)
        # try adding _index to the destination
        if dest not in website:
            dest = os.path.join(dest, '_index')
        # add alias :)
        if dest in website:
            website[dest].aliases.append(file)
        else:
            print(f'Warning: Redirect {file}->{dest} could not be found')


def add_warning_to_all_pages():
    """
    adds the conversion warning to all pages
    """
    for page in website.values():
        assert isinstance(page, HugoPage), 'Please run this function after fix_redirects()'
        page.body = "{{< migration-warning >}}\n\n" + page.body


def oldify_everything():
    """
    move every page under ARCHIVE_DIR and pin each page's url to its original path
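
    e.g. "projects/_index" ends up at "archive/projects/_index.md" but keeps
    url = "/projects", so existing links still resolve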
    """
    global website # ugh, python being python
    # pin every page's url to its original path
    for file, page in website.items():
        # hugo doesn't let us redirect from /, understandably
        if file != '_index':
            page.url = fix_alias(file)
    # prepend the archive directory to all the paths
    website = {os.path.join(ARCHIVE_DIR, file): page for file, page in website.items()}


def wipe_summaries():
    """
    replace the summary of every page with an LLM-generated summary
    """
    for file, page in website.items():
        if isinstance(page, HugoPage):
            page.summary = summarize(page.title, page.body)
            sleep(0.5)


if __name__ == "__main__":
    # we will get the timestamps from git
    # git requires us to be in the repo to run any commands
    os.chdir(REPO_DIR)

    # first, we go through all mdwn files and convert them to an internal representation
    # of either a Hugo page or a redirect
    initial_convert_all_pages()

    # next we nuke all "redirects" and set the aliases field appropriately
    fix_redirects()

    # optionally we can add a warning about migration to all pages
    add_warning_to_all_pages()

    # optionally we can pin every page's original URL so that the files can all
    # live under an `archive/` subdirectory without breaking links
    oldify_everything()

    # set explicit summaries: without them, hugo would auto-generate a summary from
    # the start of the body, which is now the migration warning
    wipe_summaries()

    # finally, we save into the output directory
    save_to_disk()
