dbbackup: A minimal database backup plugin #40

Closed
41 changes: 41 additions & 0 deletions dbbackup/README.md
@@ -0,0 +1,41 @@
## Summary

This plugin keeps a synchronized backup of c-lightning's (CL) sqlite3 database.
It uses the `db_write` hook so that every commit (write) to CL's database is first
written to the backup. This allows recovery of any committed-to channel state,
including HTLCs. This plugin does not back up the seed and is not a complete
node backup.
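
At a high level the mechanism looks like the sketch below. This is only an
illustration: the backup path and the pre-opened connection are assumptions made
here for brevity; the full implementation is in `dbbackup.py`.

```python
#!/usr/bin/env python3
# Minimal sketch of the db_write-hook mechanism. The real plugin (dbbackup.py)
# adds backup creation, a consistency check and error handling.
import sqlite3
from lightning import Plugin

plugin = Plugin()
# Assumed to exist already as a copy of CL's database.
backup_conn = sqlite3.connect('/path/to/lightningd.sqlite3-backup',
                              isolation_level=None)


@plugin.hook('db_write')
def db_write(plugin, writes):
    # `writes` is the list of SQL statements CL is about to commit; mirroring
    # them keeps the backup in lock-step with the original.
    for statement in writes:
        backup_conn.execute(statement)
    return True   # True lets CL commit; False makes it abort


plugin.run()
```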

## Installation

For general plugin installation instructions see the repo's main
[README.md](https://github.com/lightningd/plugins/blob/master/README.md#Installation).

## Options

* `--db-backup-file`: path of the backup file (required; the plugin shuts CL down if it is not set)

## Usage

If the given `db-backup-file` doesn't exist yet, it will be created from a
copy of CL's database.

During startup, any existing backup file is checked against CL's current
database. If that check fails, or initialization fails for any other reason, the
plugin shuts down CL and logs `**BROKEN**`. If the plugin later fails to write to
the backup file, it triggers CL to crash.

The backup file is created with read/write permission for the owner only; it
contains sensitive information, so handle it with care. When the plugin complains
about a mismatch between the backup and the original database, investigate what
caused it before recovering.

To recover: shut down CL and copy the backup to `~/.lightning/lightningd.sqlite3`.
File permissions may need to be restored.
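
For example, the copy could be scripted roughly like this (a sketch only; the
backup path is whatever was passed as `--db-backup-file`, and lightningd must
not be running while you do this):

```python
import os
import shutil
from stat import S_IRUSR, S_IWUSR

backup = '/path/to/lightningd.sqlite3-backup'   # your --db-backup-file
db = os.path.expanduser('~/.lightning/lightningd.sqlite3')

shutil.copyfile(backup, db)      # restore the database from the backup
os.chmod(db, S_IRUSR | S_IWUSR)  # restore owner-only read/write permissions
```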

## Testing

The tests use c-lightning's pytest framework. To run them, link or copy this
repository's `dbbackup` directory into the c-lightning repo's `tests` directory.
Then, from the c-lightning repo directory, run the `test_dbbackup_*` tests with:
`DEVELOPER=1 py.test tests/ -s -v -k test_dbbackup_`
Empty file added dbbackup/__init__.py
Empty file.
102 changes: 102 additions & 0 deletions dbbackup/dbbackup.py
@@ -0,0 +1,102 @@
#!/usr/bin/env python3
from lightning import Plugin
import os
import shutil
import sqlite3
from itertools import zip_longest
from stat import S_IRUSR, S_IWUSR

plugin = Plugin()
plugin.sqlite_pre_init_cmds = []
plugin.initted = False


class NewBackupFileError(Exception):
def __init__(self, path, e):
self.message = 'Could not create db-backup-file {} : {}'.format(path, e)


# Create a fresh backup file, starting as a copy of the original database
def new_backup_file(db, backup):
try:
shutil.copyfile(db, backup)
os.chmod(backup, S_IRUSR | S_IWUSR) # rw permission for owner
plugin.log('Creating new db-backup-file: {}'.format(backup))
except Exception as e:
raise NewBackupFileError(backup, e)


def compare_dbs(db1, db2):
    # Compare the full SQL dumps; zip_longest makes a length mismatch count as a difference
    for a, b in zip_longest(db1.iterdump(), db2.iterdump(), fillvalue=None):
        if a != b:
            return False
    return True


@plugin.init()
def init(configuration, options, plugin):
# FIXME: `==` should be changed to `is not None`, see workaround below
if plugin.get_option('db-backup-file') == '':
plugin.log('No db-backup-file specified', 'error')
plugin.rpc.stop() # stop lightningd

try:
db = os.path.join(configuration['lightning-dir'], 'lightningd.sqlite3')
backup = plugin.get_option('db-backup-file')

        # If a backup exists, replay pre_init_cmds on a temporary copy
if os.path.isfile(backup):
plugin.log('Found existing db-backup-file: {} comparing...'.format(backup))
backup_copy = shutil.copy(backup, backup + '.tmp')
db1 = sqlite3.connect(backup_copy, isolation_level=None)
db2 = sqlite3.connect('file:{}?mode=ro'.format(db), uri=True) # open in read-only
for c in plugin.sqlite_pre_init_cmds:
db1.execute(c)

            # If it then matches the original db, replace backup with copy ... else abort
dbs_match = compare_dbs(db1, db2)
db1.close()
db2.close()
if dbs_match:
os.rename(backup_copy, backup)
plugin.log("Existing db-backup-file OK and successfully synced")
else:
plugin.log("Existing db-backup-file differs from original database, i.e. applying"

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A better handling of this would be to just outright copy the original over the backup. Though it would not be safe to do so in the init as that is async. Instead, on the first db_write after init, that is when you do your overwriting the backup with the original. Though this is sensitive to the order in which C-Lightning writes to its own original copy (whether it does the write before db_write or after db_write). Might be more complicated than what you are willing to implement and I suppose the operator can manually do the copy in this edge case while C-Lightning is not running.

" pre-init statements (to a copy) didn't make it match the original db", 'error')
os.remove(backup_copy)
plugin.rpc.stop() # stop lightningd
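
A rough sketch of that suggestion, for illustration only and not part of this PR;
the `needs_resync` flag and the `db_path`/`backup_path` attributes are hypothetical
and would have to be set up during init:

```python
# Hypothetical variant of the db_write hook following the suggestion above:
# on the first write after init, resync the backup from the original instead
# of aborting on a mismatch. As noted, this is sensitive to whether C-Lightning
# applies the writes to its own database before or after calling the hook.
import shutil
import sqlite3


def db_write_resync_first(plugin, writes):
    if plugin.needs_resync:  # hypothetical flag set in init when the comparison fails
        plugin.conn.close()
        shutil.copyfile(plugin.db_path, plugin.backup_path)  # hypothetical attributes
        plugin.conn = sqlite3.connect(plugin.backup_path, isolation_level=None)
        plugin.needs_resync = False
    for c in writes:
        plugin.conn.execute(c)
    return True
```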

else:
new_backup_file(db, backup)

plugin.conn = sqlite3.connect(backup, isolation_level=None)
plugin.initted = True
plugin.log('Initialized')
except Exception as e:
if isinstance(e, NewBackupFileError):
plugin.log(e.message, 'error')
else:
plugin.log('Initialization failed: {}'.format(e), 'error')

plugin.rpc.stop() # stop lightningd


@plugin.hook('db_write')
def db_write(plugin, writes):
if not plugin.initted:
plugin.sqlite_pre_init_cmds += writes

Review comment:
There is a potential (but unlikely) race condition where db_write is invoked before init, then C-lightning crashes before it is able to init the plugins. It may be safer to save these in a temporary file that you clear out once init is invoked.

Contributor:
Good catch, though writes before the init call will all be migrations IIRC, so these are not catastrophic.

@ZmnSCPxj (Aug 16, 2019):
Not keeping up with C-lightning upgrades is "not catastrophic"? Upgrades are a risk by themselves as they represent changed code, and the user might be running on some obscure combination of system components that happen to work on older releases of C-lightning but not on newer ones, so a new release (which is when we are likely to have migrations running) is at increased risk of crashes before init... shrug

Contributor Author:
@ZmnSCPxj Thanks, there is also a potential deadlock when the plugin makes the rpc.stop() call from init while lightningd is simultaneously waiting for a db_write hook to return. TBH I was totally unaware that a plugin with (sync) hooks shouldn't make any (sync) RPC calls, at least not from the same thread that also handles the hook.

If I understand correctly, RPC calls can be sent after jsonrpc_listen but are not answered until the main io_loop_with_timers.

To solve these and similar issues, I am exploring a sync init in plugins_config where lightningd waits for the init response of certain (perhaps all static?) plugins before continuing startup, similar to what it does for getmanifest. And maybe shut down CL when a plugin's init returns False? Will make a PR if that doesn't break too much.
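
A sketch of the temporary-file idea from this thread, purely illustrative and not
part of this PR; the spool path and helper names are made up here:

```python
# Hypothetical: persist pre-init statements to a spool file so they survive a
# crash between an early db_write and the plugin's init; init would replay the
# spooled statements and then delete the file.
import json
import os
import tempfile

PRE_INIT_SPOOL = os.path.join(tempfile.gettempdir(), 'dbbackup-pre-init.json')


def spool_pre_init_writes(writes):
    spooled = []
    if os.path.exists(PRE_INIT_SPOOL):
        with open(PRE_INIT_SPOOL) as f:
            spooled = json.load(f)
    with open(PRE_INIT_SPOOL, 'w') as f:
        json.dump(spooled + writes, f)


def replay_and_clear_spool(conn):
    # Called from init: apply any spooled statements to the backup connection.
    if os.path.exists(PRE_INIT_SPOOL):
        with open(PRE_INIT_SPOOL) as f:
            for statement in json.load(f):
                conn.execute(statement)
        os.remove(PRE_INIT_SPOOL)
```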

else:
for c in writes:
try:
plugin.conn.execute(c)
except Exception as e:
plugin.log('Failed to write to backup: {}, SQL-statement: {}'.format(e, c), 'error')
                # This will `FATAL SIGNAL 6` crash lightningd, but it ensures that the write
                # which failed here (to the backup) is also not committed to the original database
return False

return True


# Workaround for empty or absent option being (incorrectly?) passed as `null`
plugin.add_option('db-backup-file', '', 'The database backup file.')
plugin.run()
1 change: 1 addition & 0 deletions dbbackup/requirements.txt
@@ -0,0 +1 @@
pylightning>=0.0.7.3
Empty file added dbbackup/tests/__init__.py
Empty file.
Binary file added dbbackup/tests/lightningd-v102.sqlite3
Binary file not shown.
Binary file added dbbackup/tests/lightningd-v102.sqlite3-backup
Binary file not shown.
163 changes: 163 additions & 0 deletions dbbackup/tests/test_dbbackup.py
@@ -0,0 +1,163 @@
from fixtures import * # noqa: F401,F403
from flaky import flaky # noqa: F401
from lightning import RpcError
from utils import DEVELOPER, wait_for

import os
import pytest
import re
import shutil
import signal
import time
import unittest


# Crashing or shutting down a node raises unpredictable errors/exceptions, hence @flaky
@flaky
def test_dbbackup_init(node_factory, executor):
"""Test plugin init: option --db-backup-file present, correct path and check/compare existing backup"""

# Option `--db-backup-file` missing, should error and shutdown after start
# random_hsm=True so that our stored v102 database is valid
l1 = node_factory.get_node(allow_broken_log=True, random_hsm=True, start=False,
options={'plugin': 'tests/dbbackup/dbbackup.py'})

with pytest.raises(ConnectionResetError):
l1.start()
# wait_for_log only works on running daemon and ours is exiting with rpc.stop()
time.sleep(3)
assert l1.daemon.is_in_log(r'\*\*BROKEN\*\* plugin-dbbackup.py No db-backup-file specified', start=l1.daemon.logsearch_start)

# Now with an invalid file path (a directory), should error and shutdown
l1.daemon.opts['db-backup-file'] = node_factory.directory
with pytest.raises(ConnectionResetError):
l1.start()
assert l1.daemon.is_in_log(r'\*\*BROKEN\*\* plugin-dbbackup.py Could not create db-backup-file', start=l1.daemon.logsearch_start)

# Create proper backup
backup = os.path.join(node_factory.directory, "lightningd.sqlite3-backup")
l1.daemon.opts['db-backup-file'] = backup
l1.start()
assert l1.daemon.is_in_log('plugin-dbbackup.py Creating new db-backup-file: {}'.format(backup), start=l1.daemon.logsearch_start)
assert l1.daemon.is_in_log(r'plugin-dbbackup.py Initialized', start=l1.daemon.logsearch_start)

# Disable the plugin, restart and trigger db change so it will differ from the backup
l1.stop()
del l1.daemon.opts['plugin']
del l1.daemon.opts['db-backup-file']
l1.start()
l1.rpc.newaddr()

# Re-enable plugin and restart, should error and shutdown
l1.stop()
l1.daemon.opts['plugin'] = 'tests/dbbackup/dbbackup.py'
l1.daemon.opts['db-backup-file'] = backup
l1.start()

needle = l1.daemon.logsearch_start
    time.sleep(2)
    assert l1.daemon.is_in_log(r'Found existing db-backup-file: {} comparing...'.format(backup), start=needle)
assert l1.daemon.is_in_log(r'\*\*BROKEN\*\* plugin-dbbackup.py Existing db-backup-file differs from original database', start=needle)
assert l1.daemon.is_in_log(r'UNUSUAL lightningd(.*): JSON-RPC shutdown', start=needle)


@unittest.skipIf(not DEVELOPER, "needs DEVELOPER=1")
def test_dbbackup_recover(node_factory, executor):
"""Tests db backup plugin, recover from an unfortunate loss of database."""

    # l3 is our unfortunate victim; may_reconnect=False prevents reconnect attempts,
    # but incoming or manual connections still work
db_backup = os.path.join(node_factory.directory, "l3_lightningd.sqlite3-backup")
opts = [{'may_reconnect': True},
{'may_reconnect': False, 'may_fail': True},
{'may_reconnect': False, 'may_fail': True,
'plugin': 'tests/dbbackup/dbbackup.py',
'db-backup-file': db_backup,
'disconnect': ['@WIRE_UPDATE_FULFILL_HTLC']}]

    # l3 loses its database with a beneficial HTLC in flight
l1, l2, l3 = node_factory.line_graph(3, opts=opts, wait_for_announce=True)
l1.wait_for_route(l3)

phash = l3.rpc.invoice(123000, 'test_pay', 'description')['payment_hash']
route = l1.rpc.getroute(l3.info['id'], 123000, 1)['route']
l1.rpc.sendpay(route, phash)
l3.daemon.wait_for_log('Peer transient failure in CHANNELD_NORMAL')

# Crash l3 and replace its database with the backup, restart and reconnect
# FIXME Make it dev-crash ?
l3.daemon.kill()
db_orig = os.path.join(l3.daemon.lightning_dir, 'lightningd.sqlite3')
os.rename(db_backup, db_orig)
l3.daemon.opts.pop('dev-disconnect')
l3.daemon.opts.pop('dev-no-reconnect')
assert l1.rpc.listsendpays(payment_hash=phash)['payments'][0]['status'] == 'pending'
l3.start()
l2.rpc.connect(l3.info['id'], 'localhost', l3.port)['id']
wait_for(lambda: l1.rpc.listsendpays(payment_hash=phash)['payments'][0]['status'] == 'complete')

# a HACK to get around `ValueError: 2 nodes had unexpected reconnections`
l3.daemon.logs = [re.sub('Peer has reconnected', 'MODDED_PeerHasReconnected', l) for l in l3.daemon.logs]
l2.daemon.logs = [re.sub('Peer has reconnected', 'MODDED_PeerHasReconnected', l) for l in l2.daemon.logs]

    # TODO: Can we come up with something harder?
    # Is option_data_loss_protect doing anything?


def test_dbbackup_write_fail(node_factory, executor):
"""When the plugin cannot write to backup file for some reason"""

backup = os.path.join(node_factory.directory, "lightningd.sqlite3-backup")
l1 = node_factory.get_node(allow_broken_log=True,
options={'plugin': 'tests/dbbackup/dbbackup.py',
'db-backup-file': backup})

    # renaming the backup file will cause a db_write failure and crash lightningd
os.rename(backup, backup + '_')
Contributor:
How does this cause a failure writing to the DB? I thought sqlite3 keeps an fd open to the db file, which is unaffected by moves.

Contributor Author:
I have no idea, but it works. The error returned by sqlite3: attempt to write a readonly database.

Any other ideas how to emulate a disk-full scenario? (maybe a loop device)

with pytest.raises(RpcError):
l1.rpc.newaddr() # Trigger a (post-init) database change
# cannot use wait_for_log because daemon is dying
assert l1.daemon.is_in_log(r'\*\*BROKEN\*\* plugin-dbbackup.py Failed to write to backup:', start=l1.daemon.logsearch_start)

    # un-rename the backup file and restart; this also tests that the failed write
    # was not committed to the original database
os.rename(backup + '_', backup)
l1.start()
l1.daemon.wait_for_logs([r'plugin-dbbackup.py Existing db-backup-file OK and successfully synced',
r'plugin-dbbackup.py Initialized'])


def test_dbbackup_migrate(node_factory, executor):
"""When migrating from an older database version"""

# Create node with a copy of an old (v102) database and its backup
backup = os.path.join(node_factory.directory, "lightningd.sqlite3-backup")
l1 = node_factory.get_node(start=False,
options={'plugin': 'tests/dbbackup/dbbackup.py',
'db-backup-file': backup})

db = os.path.join(l1.daemon.lightning_dir, "lightningd.sqlite3")
shutil.copy('tests/dbbackup/tests/lightningd-v102.sqlite3', db)
shutil.copy('tests/dbbackup/tests/lightningd-v102.sqlite3-backup', backup)
l1.start()
# `Updating database...` happens before current log cursor
assert l1.daemon.is_in_log(r'Updating database from version 102 to')
l1.daemon.wait_for_log(r'Existing db-backup-file OK and successfully synced')


def test_dbbackup_plugin_kill(node_factory, executor):
"""When the plugin dies unexpectedly, lightningd dies also"""

backup = os.path.join(node_factory.directory, "lightningd.sqlite3-backup")
l1 = node_factory.get_node(may_fail=True, allow_broken_log=True,
options={'plugin': 'tests/dbbackup/dbbackup.py',
'db-backup-file': backup})

# kill the plugin, be a bit careful extracting pid from log
    logline = l1.daemon.is_in_log(r'plugin-manager started\(\d+\).*dbbackup.py')
assert logline is not None
pid = int(re.search(r'plugin-manager started\((\d+)\).*dbbackup.py', logline).group(1))
os.kill(pid, signal.SIGTERM)
time.sleep(2)
assert l1.daemon.is_in_log(r'\*\*BROKEN\*\* .*')