dbbackup: A minimal database backup plugin #40

Closed
41 changes: 41 additions & 0 deletions dbbackup/README.md
@@ -0,0 +1,41 @@
## Summary

This plugin keeps a synchronized backup of c-lightning's (CL) sqlite3 database.
It uses the `db_write` hook so that every commit (write) to CL's database is first
written to the backup. This allows recovery of any committed-to channel state,
including HTLCs. This plugin does not back up the seed and is not a complete
node backup.
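
At a high level the mechanism looks like the sketch below. This is only an
illustration: the backup path and the pre-opened connection are assumptions made
here for brevity; the full implementation is in `dbbackup.py`.

```python
#!/usr/bin/env python3
# Minimal sketch of the db_write-hook mechanism. The real plugin (dbbackup.py)
# adds backup creation, a consistency check and error handling.
import sqlite3
from lightning import Plugin

plugin = Plugin()
# Assumed to exist already as a copy of CL's database.
backup_conn = sqlite3.connect('/path/to/lightningd.sqlite3-backup',
                              isolation_level=None)


@plugin.hook('db_write')
def db_write(plugin, writes):
    # `writes` is the list of SQL statements CL is about to commit; mirroring
    # them keeps the backup in lock-step with the original.
    for statement in writes:
        backup_conn.execute(statement)
    return True   # True lets CL commit; False makes it abort


plugin.run()
```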

## Installation

For general plugin installation instructions see the repo's main
[README.md](https://github.com/lightningd/plugins/blob/master/README.md#Installation).

## Options

* `--db-backup-file`: path of the backup file (required; the plugin shuts CL down if it is not set)

## Usage

If the given `db-backup-file` doesn't exist yet, it will be created from a
copy of CL's database.

During startup, any existing backup file is checked against CL's current
database. If that check fails, or initialization fails for any other reason, the
plugin shuts down CL and logs `**BROKEN**`. If the plugin later fails to write to
the backup file, it triggers CL to crash.

The backup file is created with read/write permission for the owner only; it
contains sensitive information, so handle it with care. When the plugin complains
about a mismatch between the backup and the original database, investigate what
caused it before recovering.

To recover: shut down CL and copy the backup to `~/.lightning/lightningd.sqlite3`.
File permissions may need to be restored.
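
For example, the copy could be scripted roughly like this (a sketch only; the
backup path is whatever was passed as `--db-backup-file`, and lightningd must
not be running while you do this):

```python
import os
import shutil
from stat import S_IRUSR, S_IWUSR

backup = '/path/to/lightningd.sqlite3-backup'   # your --db-backup-file
db = os.path.expanduser('~/.lightning/lightningd.sqlite3')

shutil.copyfile(backup, db)      # restore the database from the backup
os.chmod(db, S_IRUSR | S_IWUSR)  # restore owner-only read/write permissions
```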

## Testing

The tests use c-lightning's pytest framework. To run them, link or copy this
repository's `dbbackup` directory into the c-lightning repo's `tests` directory.
Then, from the c-lightning repo directory, run the `test_dbbackup_*` tests with:
`DEVELOPER=1 py.test tests/ -s -v -k test_dbbackup_`
Empty file added dbbackup/__init__.py
Empty file.
102 changes: 102 additions & 0 deletions dbbackup/dbbackup.py
@@ -0,0 +1,102 @@
#!/usr/bin/env python3
from lightning import Plugin
import os
import shutil
import sqlite3
from itertools import zip_longest
from stat import S_IRUSR, S_IWUSR

plugin = Plugin()
plugin.sqlite_pre_init_cmds = []
plugin.initted = False


class NewBackupFileError(Exception):
def __init__(self, path, e):
self.message = 'Could not create db-backup-file {} : {}'.format(path, e)


# Create a fresh backup file, starting as a copy of the original database
def new_backup_file(db, backup):
try:
shutil.copyfile(db, backup)
os.chmod(backup, S_IRUSR | S_IWUSR) # rw permission for owner
plugin.log('Creating new db-backup-file: {}'.format(backup))
except Exception as e:
raise NewBackupFileError(backup, e)


def compare_dbs(db1, db2):
    # Compare the full SQL dumps; zip_longest makes a length mismatch count as a difference
    for a, b in zip_longest(db1.iterdump(), db2.iterdump(), fillvalue=None):
        if a != b:
            return False
    return True


@plugin.init()
def init(configuration, options, plugin):
# FIXME: `==` should be changed to `is not None`, see workaround below
if plugin.get_option('db-backup-file') == '':
plugin.log('No db-backup-file specified', 'error')
plugin.rpc.stop() # stop lightningd

try:
db = os.path.join(configuration['lightning-dir'], 'lightningd.sqlite3')
backup = plugin.get_option('db-backup-file')

        # If a backup exists, replay pre_init_cmds on a temporary copy
if os.path.isfile(backup):
plugin.log('Found existing db-backup-file: {} comparing...'.format(backup))
backup_copy = shutil.copy(backup, backup + '.tmp')
db1 = sqlite3.connect(backup_copy, isolation_level=None)
db2 = sqlite3.connect('file:{}?mode=ro'.format(db), uri=True) # open in read-only
for c in plugin.sqlite_pre_init_cmds:
db1.execute(c)

            # If it then matches the original db, replace backup with copy ... else abort
dbs_match = compare_dbs(db1, db2)
db1.close()
db2.close()
if dbs_match:
os.rename(backup_copy, backup)
plugin.log("Existing db-backup-file OK and successfully synced")
else:
plugin.log("Existing db-backup-file differs from original database, i.e. applying"

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A better handling of this would be to just outright copy the original over the backup. Though it would not be safe to do so in the init as that is async. Instead, on the first db_write after init, that is when you do your overwriting the backup with the original. Though this is sensitive to the order in which C-Lightning writes to its own original copy (whether it does the write before db_write or after db_write). Might be more complicated than what you are willing to implement and I suppose the operator can manually do the copy in this edge case while C-Lightning is not running.

" pre-init statements (to a copy) didn't make it match the original db", 'error')
os.remove(backup_copy)
plugin.rpc.stop() # stop lightningd
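
A rough sketch of that suggestion, for illustration only and not part of this PR;
the `needs_resync` flag and the `db_path`/`backup_path` attributes are hypothetical
and would have to be set up during init:

```python
# Hypothetical variant of the db_write hook following the suggestion above:
# on the first write after init, resync the backup from the original instead
# of aborting on a mismatch. As noted, this is sensitive to whether C-Lightning
# applies the writes to its own database before or after calling the hook.
import shutil
import sqlite3


def db_write_resync_first(plugin, writes):
    if plugin.needs_resync:  # hypothetical flag set in init when the comparison fails
        plugin.conn.close()
        shutil.copyfile(plugin.db_path, plugin.backup_path)  # hypothetical attributes
        plugin.conn = sqlite3.connect(plugin.backup_path, isolation_level=None)
        plugin.needs_resync = False
    for c in writes:
        plugin.conn.execute(c)
    return True
```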

else:
new_backup_file(db, backup)

plugin.conn = sqlite3.connect(backup, isolation_level=None)
plugin.initted = True
plugin.log('Initialized')
except Exception as e:
if isinstance(e, NewBackupFileError):
plugin.log(e.message, 'error')
else:
plugin.log('Initialization failed: {}'.format(e), 'error')

plugin.rpc.stop() # stop lightningd


@plugin.hook('db_write')
def db_write(plugin, writes):
if not plugin.initted:
plugin.sqlite_pre_init_cmds += writes

Review comment:
There is a potential (but unlikely) race condition where db_write is invoked before init, then C-lightning crashes before it is able to init the plugins. It may be safer to save these in a temporary file that you clear out once init is invoked.

Contributor:
Good catch, though writes before the init call will all be migrations IIRC, so these are not catastrophic.

@ZmnSCPxj (Aug 16, 2019):
Not keeping up with C-lightning upgrades is "not catastrophic"? Upgrades are a risk by themselves as they represent changed code, and the user might be running on some obscure combination of system components that happen to work on older releases of C-lightning but not on newer ones, so a new release (which is when we are likely to have migrations running) is at increased risk of crashes before init... shrug

Contributor Author:
@ZmnSCPxj Thanks, there is also a potential deadlock when the plugin makes the rpc.stop() call from init while lightningd is simultaneously waiting for a db_write hook to return. TBH I was totally unaware that a plugin with (sync) hooks shouldn't make any (sync) RPC calls, at least not from the same thread that also handles the hook.

If I understand correctly, RPC calls can be sent after jsonrpc_listen but are not answered until the main io_loop_with_timers.

To solve these and similar issues, I am exploring a sync init in plugins_config where lightningd waits for the init response of certain (perhaps all static?) plugins before continuing startup, similar to what it does for getmanifest. And maybe shut down CL when a plugin's init returns False? Will make a PR if that doesn't break too much.
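
A sketch of the temporary-file idea from this thread, purely illustrative and not
part of this PR; the spool path and helper names are made up here:

```python
# Hypothetical: persist pre-init statements to a spool file so they survive a
# crash between an early db_write and the plugin's init; init would replay the
# spooled statements and then delete the file.
import json
import os
import tempfile

PRE_INIT_SPOOL = os.path.join(tempfile.gettempdir(), 'dbbackup-pre-init.json')


def spool_pre_init_writes(writes):
    spooled = []
    if os.path.exists(PRE_INIT_SPOOL):
        with open(PRE_INIT_SPOOL) as f:
            spooled = json.load(f)
    with open(PRE_INIT_SPOOL, 'w') as f:
        json.dump(spooled + writes, f)


def replay_and_clear_spool(conn):
    # Called from init: apply any spooled statements to the backup connection.
    if os.path.exists(PRE_INIT_SPOOL):
        with open(PRE_INIT_SPOOL) as f:
            for statement in json.load(f):
                conn.execute(statement)
        os.remove(PRE_INIT_SPOOL)
```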

else:
for c in writes:
try:
plugin.conn.execute(c)
except Exception as e:
plugin.log('Failed to write to backup: {}, SQL-statement: {}'.format(e, c), 'error')
                # This will `FATAL SIGNAL 6` crash lightningd, but it ensures that the write
                # which failed here (to the backup) is also not committed to the original database
return False

return True


# Workaround for empty or absent option being (incorrectly?) passed as `null`
plugin.add_option('db-backup-file', '', 'The database backup file.')
plugin.run()
1 change: 1 addition & 0 deletions dbbackup/requirements.txt
@@ -0,0 +1 @@
pylightning>=0.0.7.3
Empty file added dbbackup/tests/__init__.py
Empty file.
Binary file added dbbackup/tests/lightningd-v102.sqlite3
Binary file not shown.
Binary file added dbbackup/tests/lightningd-v102.sqlite3-backup
Binary file not shown.
163 changes: 163 additions & 0 deletions dbbackup/tests/test_dbbackup.py
@@ -0,0 +1,163 @@
from fixtures import * # noqa: F401,F403
from flaky import flaky # noqa: F401
from lightning import RpcError
from utils import DEVELOPER, wait_for

import os
import pytest
import re
import shutil
import signal
import time
import unittest


# Crashing or shutting down a node raises unpredictable errors/exceptions, hence @flaky
@flaky
def test_dbbackup_init(node_factory, executor):
"""Test plugin init: option --db-backup-file present, correct path and check/compare existing backup"""

# Option `--db-backup-file` missing, should error and shutdown after start
# random_hsm=True so that our stored v102 database is valid
l1 = node_factory.get_node(allow_broken_log=True, random_hsm=True, start=False,
options={'plugin': 'tests/dbbackup/dbbackup.py'})

with pytest.raises(ConnectionResetError):
l1.start()
# wait_for_log only works on running daemon and ours is exiting with rpc.stop()
time.sleep(3)
assert l1.daemon.is_in_log(r'\*\*BROKEN\*\* plugin-dbbackup.py No db-backup-file specified', start=l1.daemon.logsearch_start)

# Now with an invalid file path (a directory), should error and shutdown
l1.daemon.opts['db-backup-file'] = node_factory.directory
with pytest.raises(ConnectionResetError):
l1.start()
assert l1.daemon.is_in_log(r'\*\*BROKEN\*\* plugin-dbbackup.py Could not create db-backup-file', start=l1.daemon.logsearch_start)

# Create proper backup
backup = os.path.join(node_factory.directory, "lightningd.sqlite3-backup")
l1.daemon.opts['db-backup-file'] = backup
l1.start()
assert l1.daemon.is_in_log('plugin-dbbackup.py Creating new db-backup-file: {}'.format(backup), start=l1.daemon.logsearch_start)
assert l1.daemon.is_in_log(r'plugin-dbbackup.py Initialized', start=l1.daemon.logsearch_start)

# Disable the plugin, restart and trigger db change so it will differ from the backup
l1.stop()
del l1.daemon.opts['plugin']
del l1.daemon.opts['db-backup-file']
l1.start()
l1.rpc.newaddr()

# Re-enable plugin and restart, should error and shutdown
l1.stop()
l1.daemon.opts['plugin'] = 'tests/dbbackup/dbbackup.py'
l1.daemon.opts['db-backup-file'] = backup
l1.start()

needle = l1.daemon.logsearch_start
    time.sleep(2)
    assert l1.daemon.is_in_log(r'Found existing db-backup-file: {} comparing...'.format(backup), start=needle)
assert l1.daemon.is_in_log(r'\*\*BROKEN\*\* plugin-dbbackup.py Existing db-backup-file differs from original database', start=needle)
assert l1.daemon.is_in_log(r'UNUSUAL lightningd(.*): JSON-RPC shutdown', start=needle)


@unittest.skipIf(not DEVELOPER, "needs DEVELOPER=1")
def test_dbbackup_recover(node_factory, executor):
"""Tests db backup plugin, recover from an unfortunate loss of database."""

    # l3 is our unfortunate victim; may_reconnect=False prevents reconnect attempts,
    # but incoming or manual connections still work
db_backup = os.path.join(node_factory.directory, "l3_lightningd.sqlite3-backup")
opts = [{'may_reconnect': True},
{'may_reconnect': False, 'may_fail': True},
{'may_reconnect': False, 'may_fail': True,
'plugin': 'tests/dbbackup/dbbackup.py',
'db-backup-file': db_backup,
'disconnect': ['@WIRE_UPDATE_FULFILL_HTLC']}]

    # l3 loses its database with a beneficial HTLC in flight
l1, l2, l3 = node_factory.line_graph(3, opts=opts, wait_for_announce=True)
l1.wait_for_route(l3)

phash = l3.rpc.invoice(123000, 'test_pay', 'description')['payment_hash']
route = l1.rpc.getroute(l3.info['id'], 123000, 1)['route']
l1.rpc.sendpay(route, phash)
l3.daemon.wait_for_log('Peer transient failure in CHANNELD_NORMAL')

# Crash l3 and replace its database with the backup, restart and reconnect
# FIXME Make it dev-crash ?
l3.daemon.kill()
db_orig = os.path.join(l3.daemon.lightning_dir, 'lightningd.sqlite3')
os.rename(db_backup, db_orig)
l3.daemon.opts.pop('dev-disconnect')
l3.daemon.opts.pop('dev-no-reconnect')
assert l1.rpc.listsendpays(payment_hash=phash)['payments'][0]['status'] == 'pending'
l3.start()
l2.rpc.connect(l3.info['id'], 'localhost', l3.port)['id']
wait_for(lambda: l1.rpc.listsendpays(payment_hash=phash)['payments'][0]['status'] == 'complete')

# a HACK to get around `ValueError: 2 nodes had unexpected reconnections`
l3.daemon.logs = [re.sub('Peer has reconnected', 'MODDED_PeerHasReconnected', l) for l in l3.daemon.logs]
l2.daemon.logs = [re.sub('Peer has reconnected', 'MODDED_PeerHasReconnected', l) for l in l2.daemon.logs]

    # TODO: Can we come up with something harder?
    # Is option_data_loss_protect doing anything?


def test_dbbackup_write_fail(node_factory, executor):
"""When the plugin cannot write to backup file for some reason"""

backup = os.path.join(node_factory.directory, "lightningd.sqlite3-backup")
l1 = node_factory.get_node(allow_broken_log=True,
options={'plugin': 'tests/dbbackup/dbbackup.py',
'db-backup-file': backup})

    # renaming the backup file will cause a db_write failure and crash lightningd
os.rename(backup, backup + '_')
Contributor:
How does this cause a failure writing to the DB? I thought sqlite3 keeps an fd open to the db file, which is unaffected by moves.

Contributor Author:
I have no idea, but it works. The error returned by sqlite3: attempt to write a readonly database.

Any other ideas how to emulate a disk-full scenario? (maybe a loop device)

with pytest.raises(RpcError):
l1.rpc.newaddr() # Trigger a (post-init) database change
# cannot use wait_for_log because daemon is dying
assert l1.daemon.is_in_log(r'\*\*BROKEN\*\* plugin-dbbackup.py Failed to write to backup:', start=l1.daemon.logsearch_start)

    # un-rename the backup file and restart; this also tests that the failed write
    # was not committed to the original database
os.rename(backup + '_', backup)
l1.start()
l1.daemon.wait_for_logs([r'plugin-dbbackup.py Existing db-backup-file OK and successfully synced',
r'plugin-dbbackup.py Initialized'])


def test_dbbackup_migrate(node_factory, executor):
"""When migrating from an older database version"""

# Create node with a copy of an old (v102) database and its backup
backup = os.path.join(node_factory.directory, "lightningd.sqlite3-backup")
l1 = node_factory.get_node(start=False,
options={'plugin': 'tests/dbbackup/dbbackup.py',
'db-backup-file': backup})

db = os.path.join(l1.daemon.lightning_dir, "lightningd.sqlite3")
shutil.copy('tests/dbbackup/tests/lightningd-v102.sqlite3', db)
shutil.copy('tests/dbbackup/tests/lightningd-v102.sqlite3-backup', backup)
l1.start()
# `Updating database...` happens before current log cursor
assert l1.daemon.is_in_log(r'Updating database from version 102 to')
l1.daemon.wait_for_log(r'Existing db-backup-file OK and successfully synced')


def test_dbbackup_plugin_kill(node_factory, executor):
"""When the plugin dies unexpectedly, lightningd dies also"""

backup = os.path.join(node_factory.directory, "lightningd.sqlite3-backup")
l1 = node_factory.get_node(may_fail=True, allow_broken_log=True,
options={'plugin': 'tests/dbbackup/dbbackup.py',
'db-backup-file': backup})

# kill the plugin, be a bit careful extracting pid from log
    logline = l1.daemon.is_in_log(r'plugin-manager started\(\d+\).*dbbackup.py')
assert logline is not None
pid = int(re.search(r'plugin-manager started\((\d+)\).*dbbackup.py', logline).group(1))
os.kill(pid, signal.SIGTERM)
time.sleep(2)
assert l1.daemon.is_in_log(r'\*\*BROKEN\*\* .*')