Hexinverter ¬ I ported my website to Pixywerk2 from Wordpress

This short post is about my website compiler, Pixywerk (Gitea project / GitHub mirror).

I used to host my website on WordPress, but I am migrating away from it in favor of a more privacy-sensitive platform: my own static web hosting space. As part of that, I am also working on Pixywerk2, a metadata-based static website compiler.

An interesting part of this is that you can export the contents of your WordPress site (at least posts, pages, and attachments) into an XML file, so I wrote a little script to process it into a Pixywerk2 folder (which I'll be including in the Pixywerk2 distribution).

Here's the code!

import argparse
import datetime
import json
import os
import sys
from urllib.parse import urlparse
from xml.etree.ElementTree import ElementTree

import requests

# Output filename template; save_cont() fills in ``postdate`` and ``postname``.
FILE_PATTERN = "{postdate}-{postname}.thtml"


def parse_args(args):
    """Parse the importer's command-line arguments.

    Args:
        args: Argument strings to parse, without the program name
            (for example ``sys.argv[1:]``).

    Returns:
        The parsed ``argparse.Namespace``. ``post_dir``, ``page_dir`` and
        ``attachment_dir`` are rewritten so that they are joined onto
        ``out_dir``.
    """
    parser = argparse.ArgumentParser("importwp.py")

    parser.add_argument("input", help="The input file.")
    # nargs="?" makes this positional optional; without it argparse ignores
    # ``default`` and always demands the argument.
    parser.add_argument("out_dir", nargs="?", default=".", help="Output root directory.")
    parser.add_argument(
        "--fetch-attachments",
        help="Fetch all attachments referred to in file.",
        action="store_true",
        dest="fetch_attachments",
    )
    parser.add_argument(
        "--attachment-dir",
        help="Subdirectory to place attachments in.",
        default="attachments",
        dest="attachment_dir",
    )
    parser.add_argument(
        "--post-dir",
        help="Subdirectory to place posts in.",
        default="posts",
        dest="post_dir",
    )
    parser.add_argument(
        "--page-dir",
        help="Subdirectory to place pages in.",
        default="",
        dest="page_dir",
    )

    result = parser.parse_args(args)
    # The sub-directory options are relative to the output root.
    result.post_dir = os.path.join(result.out_dir, result.post_dir)
    result.page_dir = os.path.join(result.out_dir, result.page_dir)
    result.attachment_dir = os.path.join(result.out_dir, result.attachment_dir)

    return result


def parse_input(xmlpath):
    """Read a WordPress export and sort its items by post type.

    Args:
        xmlpath: Source of the export XML, handed to ``ElementTree.parse``.

    Returns:
        A ``(posts, attachments, pages)`` tuple of dicts, each keyed by the
        text of the item's ``post_id`` element. Every value maps field names
        to the XML elements found in the item (``None`` when an element is
        absent); ``'categories'`` is a list of elements. Attachment records
        additionally carry ``'att_url'``. Items whose status is ``draft`` and
        items of any other post type are left out.
    """
    wp_ns = "{http://wordpress.org/export/1.2/}"
    content_tag = "{http://purl.org/rss/1.0/modules/content/}encoded"

    root = ElementTree().parse(source=xmlpath)

    posts, attachments, pages = {}, {}, {}
    # Route each record to its destination dict by post type.
    buckets = {"post": posts, "attachment": attachments, "page": pages}

    for item in root.find("channel"):
        if item.tag != "item":
            continue

        kind = item.find(wp_ns + "post_type")
        if kind is None:
            continue

        state = item.find(wp_ns + "status")
        if state is not None and state.text == "draft":
            continue

        bucket = buckets.get(kind.text)
        if bucket is None:
            # Some other post type: nothing to record.
            continue

        record = {
            'content': item.find(content_tag),
            'title': item.find("title"),
            'pubdate': item.find("pubDate"),
            'description': item.find("description"),
            'post_name': item.find(wp_ns + "post_name"),
            'categories': item.findall("category"),
            'post_parent': item.find(wp_ns + "post_parent"),
        }
        if kind.text == "attachment":
            record['att_url'] = item.find(wp_ns + "attachment_url")

        bucket[item.find(wp_ns + "post_id").text] = record

    return posts, attachments, pages

def fetch_attachment(attch, outdir):
    """Download one attachment into ``outdir``.

    Args:
        attch: Attachment record whose ``'att_url'`` entry is an element
            holding the download URL as its text.
        outdir: Existing directory to save into. The file is named after the
            last path component of the URL.

    A response with an HTTP error status is reported on stderr and skipped,
    so that an error page is never saved in place of the attachment.
    """
    url = attch['att_url'].text
    filename = os.path.join(outdir, os.path.basename(urlparse(url).path))
    print("fetching attachment", url, "->", filename)

    # Bounded timeout so one unresponsive host cannot stall the whole import.
    r = requests.get(url, timeout=60)
    if not r.ok:
        print("failed to fetch", url, "- HTTP status", r.status_code, file=sys.stderr)
        return

    with open(filename, 'wb') as outf:
        outf.write(r.content)

def save_cont(post, outdir):
    """Write one post or page as a content file plus a JSON ``.meta`` file.

    Args:
        post: Record with ``'pubdate'``, ``'post_name'``, ``'title'``,
            ``'content'`` and ``'description'`` elements and a
            ``'categories'`` list of elements.
        outdir: Existing directory that receives both files.

    The content file is named from ``FILE_PATTERN``; the metadata file has
    the same name with ``.meta`` appended.
    """
    dt = datetime.datetime.strptime(post['pubdate'].text, "%a,  %d %b %Y %H:%M:%S %z")
    postdate = dt.strftime("%Y-%m-%d-%H%M%S")
    filename = FILE_PATTERN.format(postdate=postdate, postname=post['post_name'].text)
    print(post['title'].text, "->", filename)

    # Explicit UTF-8 so the output does not depend on the platform's locale.
    with open(os.path.join(outdir, filename), "w", encoding="utf-8") as outf:
        # An empty content element has ``text`` set to None; write nothing
        # rather than crash.
        outf.write(post['content'].text or "")

    # A <category> element with domain="category" names the category;
    # every other one is treated as a tag.
    tags = []
    category = ""
    for tg in post['categories']:
        if tg.attrib.get("domain") == "category":
            category = tg.text
        else:
            tags.append(tg.text)

    metadata = {
        "title": post['title'].text,
        "description": post['description'].text,
        "post_time": dt.timestamp(),
        "featured": "",
        "tags": tags,
        "category": category,
    }
    with open(os.path.join(outdir, filename + ".meta"), "w", encoding="utf-8") as outf:
        json.dump(metadata, outf)


def main():
    """Run the importer: parse arguments, create directories, convert the export.

    Reads the command line from ``sys.argv``, optionally downloads the
    attachments, then writes every post and page.

    Returns:
        0, for use as the process exit status.
    """
    args = parse_args(sys.argv[1:])

    out_dirs = [args.out_dir, args.page_dir, args.post_dir]
    if args.fetch_attachments:
        out_dirs.append(args.attachment_dir)
    for path in out_dirs:
        # makedirs creates missing parent directories too, so nested paths
        # work, and exist_ok tolerates directories that are already there.
        os.makedirs(path, exist_ok=True)

    posts, attachments, pages = parse_input(args.input)

    if args.fetch_attachments:
        for attachment in attachments.values():
            fetch_attachment(attachment, args.attachment_dir)

    for post in posts.values():
        save_cont(post, args.post_dir)
    for page in pages.values():
        save_cont(page, args.page_dir)

    return 0


# Run the importer only when executed as a script; main()'s return value
# becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())

An interesting experience! It's not perfect but it saves a lot of rewriting. To do: Create paragraphs where there were implicit ones, and possibly add code markup.