#!/usr/bin/env python3

import html
import json
import re
from datetime import datetime

WALLABAG_EXPORT = "Wallabag All articles.json"
OUTPUT_FILE = "walla2goodlinks.json"

# https://stackoverflow.com/questions/9662346/python-code-to-remove-html-tags-from-a-string
CLEANR = re.compile(r"<.*?>")

with open(WALLABAG_EXPORT, "rt") as f:
    json_obj = json.load(f)

output_obj = []
for rec in json_obj:
    time_added = datetime.fromisoformat(rec["created_at"])
    time_read = time_added
    html_str = html.unescape(rec["content"])
    html_str = html_str.replace("\n", " ")
    html_str = re.sub(CLEANR, "", html_str)
    tags = rec["tags"]
    tags.append("+IMPORTED")
    new_obj = {
        "readAt": time_read.timestamp(),
        "addedAt": time_added.timestamp(),
        "summary": html_str[:199],
        "starred": (rec["is_starred"] == 1),
        "title": rec["title"],
        "tags": tags,
        "url": rec["url"],
    }
    print(repr(rec))
    print(repr(new_obj))
    output_obj.append(new_obj)

with open(OUTPUT_FILE, "w") as f:
    json.dump(output_obj, f)