personal/q3k: cleanup script for b/32 (owncloud postgres)

Change-Id: I7a330d460763d99bfbd736cecae33c0df7a41aae
diff --git a/personal/q3k/b/32/cleanup.py b/personal/q3k/b/32/cleanup.py
new file mode 100644
index 0000000..3ded775
--- /dev/null
+++ b/personal/q3k/b/32/cleanup.py
@@ -0,0 +1,150 @@
+# Script to attempt to clean up our owncloud database (b/32) after The Postgres
+# Fuckup (b/30).
+#
+# Think of it as a one-shot fsck, documented in the form of the code that q3k@
+# used to recover from this kerfuffle.
+#
+# SECURITY: It's full of manual SQL query crafting without parametrization.
+# Don't attempt to use it for anything other than this one-shot use case.
+#
+# You will need to tunnel to the postgreses running on Boston:
+#    $ ssh \
+#        -L15432:127.0.0.1:5432 \
+#        -L15433:127.0.0.1:5433 \
+#        hackerspace.pl
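+#
+# To sanity-check the tunnels before running this (the connections below
+# assume PG9 on local port 15432 and PG12 on 15433):
+#    $ psql -h 127.0.0.1 -p 15432 -U owncloud owncloud -c 'SELECT version()'
+#    $ psql -h 127.0.0.1 -p 15433 -U owncloud owncloud -c 'SELECT version()'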
+
+from datetime import datetime
+import os
+
+import psycopg2
+
+
+incident_start = 1611529200 # when pg12 started to run
+incident_end = 1611788400 # when we rolled back to pg9
+
+
+OWNCLOUD_PASSWORD = os.environ.get("OWNCLOUD_PASSWORD", "").strip()
+if not OWNCLOUD_PASSWORD:
+    # Get it from boston, /var/www/owncloud/config/config.php.
+    raise Exception("OWNCLOUD_PASSWORD must be set to the owncloud postgres password")
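+# For example (value redacted, get it as described above):
+#    $ OWNCLOUD_PASSWORD=... python3 cleanup.py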
+
+
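+# conn9 is the PG9 instance we rolled back to (boston's 5432, tunnelled to
+# local 15432); conn12 is the PG12 instance we rolled back from (5433 -> 15433).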
+conn9 = psycopg2.connect(host="localhost", port=15432, user="owncloud", password=OWNCLOUD_PASSWORD, dbname="owncloud")
+conn12 = psycopg2.connect(host="localhost", port=15433, user="owncloud", password=OWNCLOUD_PASSWORD, dbname="owncloud")
+
+
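+# Example usage (values are made up): idset(conn9, "share") returns something
+# like {1, 2, 5} -- the set of all oc_share.id values in the PG9 database.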
+def idset(conn, table, keyname="id"):
+    """Return a set of IDs from a given table, one per row."""
+    cur = conn.cursor()
+    cur.execute(f"SELECT {keyname} FROM oc_{table}")
+    res = cur.fetchall()
+    cur.close()
+    return set([r[0] for r in res])
+
+
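+# Example usage (values are made up): valset(conn9, "mounts", ["user_id", "mount_point"])
+# returns strings like 'alice;;;/alice/files/', one ';;;'-joined string per
+# oc_mounts row, so rows from both databases can be compared as sets.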
+def valset(conn, table, keys):
+    """Return a set of concatenated values for the given keys in a table, one per row."""
+    keynames = ", ".join(keys)
+    cur = conn.cursor()
+    cur.execute(f"SELECT {keynames} FROM oc_{table}")
+    res = cur.fetchall()
+    cur.close()
+    res = [';;;'.join([str(elem) for elem in r]) for r in res]
+    return set(res)
+
+
+# Check accounts difference.
+#
+# RESULT: Thankfully, no accounts have been accidentally rolled back.
+accounts12 = idset(conn12, "accounts", keyname="uid")
+accounts9 = idset(conn9, "accounts", keyname="uid")
+print("Accounts missing in 9:", accounts12 - accounts9)
+assert (accounts12 - accounts9) == set()
+
+
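+# Example (DN is hypothetical): an oc_ldap_user_mapping row with ldap_dn
+# 'uid=alice,ou=People,dc=example,dc=org' makes account_by_uid return 'alice'.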
+def account_by_uid(conn, uid):
+    """Return SSO UID for a given Owncloud UID."""
+    cur = conn.cursor()
+    cur.execute(f"SELECT ldap_dn FROM oc_ldap_user_mapping WHERE owncloud_name = '{uid}'")
+    dn, = cur.fetchone()
+    cur.close()
+    part = dn.split(',')[0]
+    assert part.startswith('uid=')
+    return part[4:]
+
+
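+# Example (values are made up): an oc_storages row with id 'object::user:alice'
+# resolves to account_by_uid(conn, 'alice'); the S3 object store and the
+# 'gallery' pseudo-user get the special markers "S3" and "GALLERY" instead.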
+def storage_owner_by_id(conn, id_):
+    """Return SSO UID for a given storage numerical ID."""
+    cur = conn.cursor()
+    cur.execute(f"SELECT id FROM oc_storages WHERE numeric_id = '{id_}'")
+    oid, = cur.fetchone()
+    cur.close()
+    if oid == 'object::store:amazon::nextcloud':
+        return "S3"
+    assert oid.startswith('object::user:')
+    userid = oid[13:]
+    assert len(userid) > 0
+    if userid == "gallery":
+        return "GALLERY"
+    return account_by_uid(conn, userid)
+
+
+# Check shares table. This table contains the intent of sharing some file with someone else.
+#
+# RESULT: we only have things that have been removed after rollback to PG9,
+# nothing was created in PG12 and lost.
+shareids12 = idset(conn12, "share")
+shareids9 = idset(conn9, "share")
+print("Shares missing in 9:", len(shareids12 - shareids9))
+cur12 = conn12.cursor()
+for id_ in list(shareids12-shareids9):
+    cur12.execute(f"SELECT uid_owner, file_target, stime, share_with FROM oc_share WHERE id = {id_}")
+    uid_owner, file_target, stime, share_with = cur12.fetchone()
+    account = account_by_uid(conn12, uid_owner)
+    stime_human = datetime.utcfromtimestamp(stime).strftime('%Y-%m-%d %H:%M:%S')
+    print(f"Missing share {id_} {file_target} owned by {account}..")
+    if stime < incident_start or stime > incident_end:
+        print(f"  Skipping, created at {stime_human}")
+        continue
+    raise Exception("Unhandled.")
+cur12.close()
+
+
+# Check mounts table. This contains root file storages for each user, but also
+# incoming shares 'mounted' into a user's account.
+# From what I can tell, storage_id/root_id are the source path that's being
+# mounted (root_id being the fileid inside an oc_filecache, and storage_id
+# being the storage in which that file is kept), while user_id/mount_point are
+# the mount destination (ie. path into which this is mounted for a user's
+# view).
+#
+# RESULT: we only have share-mounts missing for a handful of users. We choose
+# to ignore it, as we assume next time these users log in they will get the
+# mounts again.
+# TODO(q3k): verify this (a possible check is sketched at the end of this file)
+mounts12 = valset(conn12, "mounts", ["storage_id", "root_id", "user_id", "mount_point"])
+mounts9 = valset(conn9, "mounts", ["storage_id", "root_id", "user_id", "mount_point"])
+print("Mounts missing in 9:", len(mounts12 - mounts9))
+# Mounts that appearify normally whenever you log into owncloud, as they are the result of shares:
+mount_names_ok = set(["2020-03-26_covid_templar", "camera", "Public Shaming", "przylbice.md", "Test.txt", "covid"])
+# Mounts that used to be from a share that existed, but has been since deleted in PG9.
+mount_names_ok |= set(["Covid-instrukcje", "Chaos_modele_covid", "Covid_proces_presspack"])
+mounts_sorted = []
+for m in list(mounts12 - mounts9):
+    storage_id, root_id, user_id, mount_point = m.split(';;;')
+    mounts_sorted.append((storage_id, root_id, user_id, mount_point))
+mounts_sorted = sorted(mounts_sorted, key=lambda el: el[2])
+for storage_id, root_id, user_id, mount_point in mounts_sorted:
+    assert mount_point.startswith("/" + user_id + "/")
+    mount_point = mount_point[len(user_id)+1:]
+    account = account_by_uid(conn12, user_id)
+    print(f"Missing mount {mount_point}, storage ID {storage_id}, owned by {account}..")
+    storage_owner = storage_owner_by_id(conn12, storage_id)
+    print(f"  Storage owner: {storage_owner}")
+
+    parts = mount_point.split('/')
+    if len(parts) == 4 and parts[0] == '' and parts[1] == 'files' and parts[2] in mount_names_ok and parts[3] == '':
+        print("  Skipping, known okay")
+        continue
+    raise Exception("Unhandled")
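+
+
+# Hypothetical follow-up for the TODO above: once the affected users have
+# logged back in, something like this could be run against PG9 to confirm the
+# share-mounts reappeared. A sketch only (same unparametrized-SQL caveat as
+# the rest of this file), not called anywhere in the cleanup itself.
+def mount_restored(conn, user_id, mount_point):
+    """Return True if oc_mounts has a row for this user_id/mount_point pair.
+
+    mount_point is the full value as stored in the database, e.g.
+    '/<user_id>/files/<name>/'.
+    """
+    cur = conn.cursor()
+    cur.execute(f"SELECT COUNT(*) FROM oc_mounts WHERE user_id = '{user_id}' AND mount_point = '{mount_point}'")
+    count, = cur.fetchone()
+    cur.close()
+    return count > 0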
+