[erlang-questions] how: mnesia with simultaneous permanent node failure (EC2)
Paul Mineiro
paul-trapexit@REDACTED
Wed Dec 12 01:29:26 CET 2007
in order to get my disaster recovery situation managed on EC2, i went
ahead and wrote mnesia_schema:del_table_copies/2, which allows one to
remove the copies of a table from multiple nodes in one operation.
attached are the patches to mnesia_schema.erl and mnesia_dumper.erl (also,
to be pleasant should these concepts be accepted, mnesia.erl should be
patched so that mnesia:del_table_copies/2 exists, but the meat is here).
also attached are the original script indicating "2 out of 3" node failure
where recovery was not possible, and a new script demonstrating the use of
the new call.
i'm not an mnesia expert so for all i know this is a really bad idea for
reasons i'm not seeing at the moment. therefore, any input from mnesia
gurus would be greatly appreciated.
thanks,
-- p
On Sat, 1 Dec 2007, Paul Mineiro wrote:
> hi.
>
> i'm thinking about using mnesia on EC2, but i'm having problems figuring
> out disaster recovery.
>
> some background: on EC2, you can start as many machines (instances) as you
> like. if you lose one, you can start another, but it'll have a different
> hostname. when you lose an instance, you lose whatever was stored on the
> drive.
>
> i was trying to figure out what i would do with a distributed mnesia
> database when i lost a node. i came up with a procedure based upon
> mnesia:del_table_copy/2 of the schema on the lost node which seems to work
> (attached as test-disaster-one).
>
> however when i tried to apply the procedure to simultaneous loss of two
> nodes, i ran into a problem; calling mnesia:del_table_copy/2 of schema
> requires all other nodes to be active, and in this scenario i have lost
> two nodes simultaneously (attached as test-disaster-two).
>
> any input from mnesia gurus would be greatly appreciated.
>
> thanks,
>
> -- p
>
> Optimism is an essential ingredient of innovation. How else can the
> individual favor change over security?
>
> -- Robert Noyce
Optimism is an essential ingredient of innovation. How else can the
individual favor change over security?
-- Robert Noyce
-------------- next part --------------
#! /bin/sh
# This script is intended to simulate loss and recovery of two
# EC2 nodes out of a pool of three
#
# Recovery is attempted by:
# 1. calling mnesia_schema:del_table_copies (schema, [LostNodes]) when the node dies
# 2. deleting the mnesia directory on the lost node (with EC2, this is
# automatic, as there is no persistent disk)
# 3. restarting the node
# 4. calling mnesia:add_table_copy (TableName, RecoveredNode) to reinstall
# tables
#
# Nodes: flassy, turgy and warezy hold the database; wazzup and killah are
# short-lived helper nodes. All use short names and the cookie "mega", and
# every peer name is built from the local host part of node (), so all nodes
# have to run on the same host.
#
# NOTE(review): -pa ../src presumably points at the patched mnesia beams that
# provide mnesia_schema:del_table_copies/2 -- confirm before running.

# start from a clean slate: remove mnesia directories left over from earlier runs
rm -rf Mnesia*flassy*
rm -rf Mnesia*turgy*
rm -rf Mnesia*warezy*
# first we start three nodes and get them running the same mnesia schema
#
# flassy (background): registers itself as "hello", switches its schema to
# disc_copies, answers one { From, ruthere } probe with imok and then blocks
# in receive after infinity so the node stays up.
erl -nostick -pa ../src -setcookie mega -sname flassy -s mnesia -noshell -noinput -eval '
true = register (hello, self ()),
{ atomic, ok } = mnesia:change_table_copy_type (schema, node (), disc_copies),
receive { From, ruthere } -> From ! imok end,
receive after infinity -> ok end
' -s erlang halt &
# wazzup (foreground): waits one second, looks up the "hello" process on
# flassy and blocks until it replies imok. Because flassy only replies after
# its schema change, the script does not move on before flassy is set up.
erl -nostick -pa ../src -setcookie mega -sname wazzup -noshell -noinput -eval '
receive after 1000 -> ok end,
"wazzup@" ++ Host = atom_to_list (node ()),
pong = net_adm:ping (list_to_atom ("flassy@" ++ Host)),
Pid = rpc:call (list_to_atom ("flassy@" ++ Host), erlang, whereis, [ hello ]),
Pid ! { self (), ruthere },
receive imok -> ok end
' -s erlang halt
# turgy (background): connects to flassy, hands the connected nodes to mnesia
# as extra_db_nodes, switches its own schema copy to disc_copies, then answers
# one ruthere probe and blocks forever.
erl -nostick -pa ../src -setcookie mega -sname turgy -s mnesia -noshell -noinput -eval '
true = register (hello, self ()),
"turgy@" ++ Host = atom_to_list (node ()),
pong = net_adm:ping (list_to_atom ("flassy@" ++ Host)),
{ ok, _ } = mnesia:change_config (extra_db_nodes, erlang:nodes ()),
{ atomic, ok } = mnesia:change_table_copy_type (schema,
node (),
disc_copies),
receive { From, ruthere } -> From ! imok end,
receive after infinity -> ok end
' -s erlang halt &
# wazzup (foreground): blocks until turgy has finished the setup above
erl -nostick -pa ../src -setcookie mega -sname wazzup -noshell -noinput -eval '
receive after 1000 -> ok end,
"wazzup@" ++ Host = atom_to_list (node ()),
pong = net_adm:ping (list_to_atom ("turgy@" ++ Host)),
Pid = rpc:call (list_to_atom ("turgy@" ++ Host), erlang, whereis, [ hello ]),
Pid ! { self (), ruthere },
receive imok -> ok end
' -s erlang halt
# warezy (background): connects to flassy and turgy, joins their mnesia
# cluster, switches its schema copy to disc_copies and creates the table
# flass with disc_copies on all three nodes. It then answers one ruthere
# probe and blocks forever.
erl -nostick -pa ../src -setcookie mega -sname warezy -s mnesia -noshell -noinput -eval '
true = register (hello, self ()),
"warezy@" ++ Host = atom_to_list (node ()),
pong = net_adm:ping (list_to_atom ("flassy@" ++ Host)),
pong = net_adm:ping (list_to_atom ("turgy@" ++ Host)),
{ ok, _ } = mnesia:change_config (extra_db_nodes, erlang:nodes ()),
{ atomic, ok } = mnesia:change_table_copy_type (schema,
node (),
disc_copies),
{ atomic, ok } =
mnesia:create_table (flass,
[ { disc_copies,
[ list_to_atom ("flassy@" ++ Host),
list_to_atom ("turgy@" ++ Host),
list_to_atom ("warezy@" ++ Host) ] } ]),
receive { From, ruthere } -> From ! imok end,
receive after infinity -> ok end
' -s erlang halt &
# wazzup (foreground): blocks until warezy has created the table
erl -nostick -pa ../src -setcookie mega -sname wazzup -noshell -noinput -eval '
receive after 1000 -> ok end,
"wazzup@" ++ Host = atom_to_list (node ()),
pong = net_adm:ping (list_to_atom ("warezy@" ++ Host)),
Pid = rpc:call (list_to_atom ("warezy@" ++ Host), erlang, whereis, [ hello ]),
Pid ! { self (), ruthere },
receive imok -> ok end
' -s erlang halt
# now kill flassy and turgy and remove them from the schema
#
# killah halts both nodes over rpc (the rpc results are ignored), then asks
# the surviving node warezy to drop the schema copies of both lost nodes in a
# single mnesia_schema:del_table_copies call, and asserts { atomic, ok }.
erl -nostick -pa ../src -setcookie mega -sname killah -noshell -noinput -eval '
"killah@" ++ Host = atom_to_list (node ()),
rpc:call (list_to_atom ("flassy@" ++ Host), erlang, halt, []),
rpc:call (list_to_atom ("turgy@" ++ Host), erlang, halt, []),
{ atomic, ok } =
rpc:call (list_to_atom ("warezy@" ++ Host),
mnesia_schema,
del_table_copies,
[ schema, [ list_to_atom ("flassy@" ++ Host),
list_to_atom ("turgy@" ++ Host) ] ])
' -s erlang halt
# step 2 of the procedure: simulate the loss of the local disk on both nodes
rm -rf Mnesia*flassy*
rm -rf Mnesia*turgy*
# now restart turgy
#
# turgy (background) comes back with an empty mnesia directory, connects to
# warezy (the only database node still running), rejoins the cluster, makes
# its schema copy disc-resident and re-adds its copy of flass.
# Note that the copy is re-added as ram_copies, whereas the table was created
# with disc_copies.
erl -nostick -pa ../src -setcookie mega -sname turgy -s mnesia -noshell -noinput -eval '
true = register (hello, self ()),
"turgy@" ++ Host = atom_to_list (node ()),
pong = net_adm:ping (list_to_atom ("warezy@" ++ Host)),
{ ok, _ } = mnesia:change_config (extra_db_nodes, erlang:nodes ()),
{ atomic, ok } = mnesia:change_table_copy_type (schema,
node (),
disc_copies),
{ atomic, ok } = mnesia:add_table_copy (flass, node (), ram_copies),
receive { From, ruthere } -> From ! imok end,
receive after infinity -> ok end
' -s erlang halt &
# wazzup (foreground): blocks until turgy has finished rejoining
erl -nostick -pa ../src -setcookie mega -sname wazzup -noshell -noinput -eval '
receive after 1000 -> ok end,
"wazzup@" ++ Host = atom_to_list (node ()),
pong = net_adm:ping (list_to_atom ("turgy@" ++ Host)),
Pid = rpc:call (list_to_atom ("turgy@" ++ Host), erlang, whereis, [ hello ]),
Pid ! { self (), ruthere },
receive imok -> ok end
' -s erlang halt
# now restart flassy
#
# flassy runs in the foreground this time: it connects to turgy and warezy,
# rejoins the cluster, makes its schema copy disc-resident, re-adds its copy
# of flass (as ram_copies) and calls mnesia:system_info () as the last
# expression of the eval. There is no probe handshake here because nothing
# runs concurrently with it.
erl -nostick -pa ../src -setcookie mega -sname flassy -s mnesia -noshell -noinput -eval '
"flassy@" ++ Host = atom_to_list (node ()),
pong = net_adm:ping (list_to_atom ("turgy@" ++ Host)),
pong = net_adm:ping (list_to_atom ("warezy@" ++ Host)),
{ ok, _ } = mnesia:change_config (extra_db_nodes, erlang:nodes ()),
{ atomic, ok } = mnesia:change_table_copy_type (schema,
node (),
disc_copies),
{ atomic, ok } = mnesia:add_table_copy (flass, node (), ram_copies),
mnesia:system_info ()
' -s erlang halt
# teardown: halt every database node over rpc; the results are ignored, so a
# node that is already down does not make this step fail
erl -nostick -pa ../src -setcookie mega -sname killah -noshell -noinput -eval '
"killah@" ++ Host = atom_to_list (node ()),
rpc:call (list_to_atom ("flassy@" ++ Host), erlang, halt, []),
rpc:call (list_to_atom ("turgy@" ++ Host), erlang, halt, []),
rpc:call (list_to_atom ("warezy@" ++ Host), erlang, halt, [])
' -s erlang halt
# remove the mnesia directories created by this run
rm -rf Mnesia*flassy*
rm -rf Mnesia*turgy*
rm -rf Mnesia*warezy*
-------------- next part --------------
#! /bin/sh
# This script is intended to simulate loss and recovery of two
# EC2 nodes out of a pool of three
#
# Recovery is attempted by:
# 1. calling mnesia:del_table_copy (schema, LostNode) when the node dies
# 2. deleting the mnesia directory on the lost node (with EC2, this is
# automatic, as there is no persistent disk)
# 3. restarting the node
# 4. calling mnesia:add_table_copy (TableName, RecoveredNode) to reinstall
# tables
#
# Unfortunately this fails because the simultaneous failure means that
# step 1. is not allowed because
# "All replicas on diskfull nodes are not active yet"
#
# Nodes: flassy, turgy and warezy hold the database; wazzup and killah are
# short-lived helper nodes. All use short names and the cookie "mega", and
# every peer name is built from the local host part of node (), so all nodes
# have to run on the same host.

# start from a clean slate: remove mnesia directories left over from earlier runs
rm -rf Mnesia*flassy*
rm -rf Mnesia*turgy*
rm -rf Mnesia*warezy*
# first we start three nodes and get them running the same mnesia schema
#
# flassy (background): registers itself as "hello", switches its schema to
# disc_copies, answers one { From, ruthere } probe with imok and then blocks
# in receive after infinity so the node stays up.
erl -setcookie mega -sname flassy -s mnesia -noshell -noinput -eval '
true = register (hello, self ()),
{ atomic, ok } = mnesia:change_table_copy_type (schema, node (), disc_copies),
receive { From, ruthere } -> From ! imok end,
receive after infinity -> ok end
' -s erlang halt &
# wazzup (foreground): waits one second, looks up the "hello" process on
# flassy and blocks until it replies imok. Because flassy only replies after
# its schema change, the script does not move on before flassy is set up.
erl -setcookie mega -sname wazzup -noshell -noinput -eval '
receive after 1000 -> ok end,
"wazzup@" ++ Host = atom_to_list (node ()),
pong = net_adm:ping (list_to_atom ("flassy@" ++ Host)),
Pid = rpc:call (list_to_atom ("flassy@" ++ Host), erlang, whereis, [ hello ]),
Pid ! { self (), ruthere },
receive imok -> ok end
' -s erlang halt
# turgy (background): connects to flassy, hands the connected nodes to mnesia
# as extra_db_nodes, switches its own schema copy to disc_copies, then answers
# one ruthere probe and blocks forever.
erl -setcookie mega -sname turgy -s mnesia -noshell -noinput -eval '
true = register (hello, self ()),
"turgy@" ++ Host = atom_to_list (node ()),
pong = net_adm:ping (list_to_atom ("flassy@" ++ Host)),
{ ok, _ } = mnesia:change_config (extra_db_nodes, erlang:nodes ()),
{ atomic, ok } = mnesia:change_table_copy_type (schema,
node (),
disc_copies),
receive { From, ruthere } -> From ! imok end,
receive after infinity -> ok end
' -s erlang halt &
# wazzup (foreground): blocks until turgy has finished the setup above
erl -setcookie mega -sname wazzup -noshell -noinput -eval '
receive after 1000 -> ok end,
"wazzup@" ++ Host = atom_to_list (node ()),
pong = net_adm:ping (list_to_atom ("turgy@" ++ Host)),
Pid = rpc:call (list_to_atom ("turgy@" ++ Host), erlang, whereis, [ hello ]),
Pid ! { self (), ruthere },
receive imok -> ok end
' -s erlang halt
# warezy (background): connects to flassy and turgy, joins their mnesia
# cluster, switches its schema copy to disc_copies and creates the table
# flass with disc_copies on all three nodes. It then answers one ruthere
# probe and blocks forever.
erl -setcookie mega -sname warezy -s mnesia -noshell -noinput -eval '
true = register (hello, self ()),
"warezy@" ++ Host = atom_to_list (node ()),
pong = net_adm:ping (list_to_atom ("flassy@" ++ Host)),
pong = net_adm:ping (list_to_atom ("turgy@" ++ Host)),
{ ok, _ } = mnesia:change_config (extra_db_nodes, erlang:nodes ()),
{ atomic, ok } = mnesia:change_table_copy_type (schema,
node (),
disc_copies),
{ atomic, ok } =
mnesia:create_table (flass,
[ { disc_copies,
[ list_to_atom ("flassy@" ++ Host),
list_to_atom ("turgy@" ++ Host),
list_to_atom ("warezy@" ++ Host) ] } ]),
receive { From, ruthere } -> From ! imok end,
receive after infinity -> ok end
' -s erlang halt &
# wazzup (foreground): blocks until warezy has created the table
erl -setcookie mega -sname wazzup -noshell -noinput -eval '
receive after 1000 -> ok end,
"wazzup@" ++ Host = atom_to_list (node ()),
pong = net_adm:ping (list_to_atom ("warezy@" ++ Host)),
Pid = rpc:call (list_to_atom ("warezy@" ++ Host), erlang, whereis, [ hello ]),
Pid ! { self (), ruthere },
receive imok -> ok end
' -s erlang halt
# now kill flassy and turgy and remove them from the schema
#
# killah halts both nodes over rpc (the rpc results are ignored), then asks
# warezy to drop the schema copy of each lost node with one
# mnesia:del_table_copy call per node, asserting { atomic, ok } each time.
# This is the step that the header reports as failing with
# "All replicas on diskfull nodes are not active yet"; the script itself
# keeps going afterwards because nothing checks the exit status of erl.
erl -setcookie mega -sname killah -noshell -noinput -eval '
"killah@" ++ Host = atom_to_list (node ()),
rpc:call (list_to_atom ("flassy@" ++ Host), erlang, halt, []),
rpc:call (list_to_atom ("turgy@" ++ Host), erlang, halt, []),
{ atomic, ok } =
rpc:call (list_to_atom ("warezy@" ++ Host),
mnesia,
del_table_copy,
[ schema, list_to_atom ("flassy@" ++ Host) ]),
{ atomic, ok } =
rpc:call (list_to_atom ("warezy@" ++ Host),
mnesia,
del_table_copy,
[ schema, list_to_atom ("turgy@" ++ Host) ])
' -s erlang halt
# step 2 of the procedure: simulate the loss of the local disk on both nodes
rm -rf Mnesia*flassy*
rm -rf Mnesia*turgy*
# now restart turgy
#
# turgy (background) comes back with an empty mnesia directory and rejoins
# through warezy, the only database node still running at this point: flassy
# was halted in the previous step and is not restarted until the next one, so
# pinging flassy here could never return pong.
# After rejoining, turgy makes its schema copy disc-resident and re-adds its
# copy of flass (step 4 of the procedure in the header), then answers one
# ruthere probe and blocks forever.
erl -setcookie mega -sname turgy -s mnesia -noshell -noinput -eval '
true = register (hello, self ()),
"turgy@" ++ Host = atom_to_list (node ()),
pong = net_adm:ping (list_to_atom ("warezy@" ++ Host)),
{ ok, _ } = mnesia:change_config (extra_db_nodes, erlang:nodes ()),
{ atomic, ok } = mnesia:change_table_copy_type (schema,
node (),
disc_copies),
{ atomic, ok } = mnesia:add_table_copy (flass, node (), ram_copies),
receive { From, ruthere } -> From ! imok end,
receive after infinity -> ok end
' -s erlang halt &
# wazzup (foreground): waits one second, then blocks until turgy replies imok,
# which it only does after the rejoin steps above have succeeded
erl -setcookie mega -sname wazzup -noshell -noinput -eval '
receive after 1000 -> ok end,
"wazzup@" ++ Host = atom_to_list (node ()),
pong = net_adm:ping (list_to_atom ("turgy@" ++ Host)),
Pid = rpc:call (list_to_atom ("turgy@" ++ Host), erlang, whereis, [ hello ]),
Pid ! { self (), ruthere },
receive imok -> ok end
' -s erlang halt
# now restart flassy
#
# flassy runs in the foreground this time: it connects to turgy and warezy,
# rejoins the cluster, makes its schema copy disc-resident, re-adds its copy
# of flass (as ram_copies, whereas the table was created with disc_copies)
# and calls mnesia:system_info () as the last expression of the eval.
erl -setcookie mega -sname flassy -s mnesia -noshell -noinput -eval '
"flassy@" ++ Host = atom_to_list (node ()),
pong = net_adm:ping (list_to_atom ("turgy@" ++ Host)),
pong = net_adm:ping (list_to_atom ("warezy@" ++ Host)),
{ ok, _ } = mnesia:change_config (extra_db_nodes, erlang:nodes ()),
{ atomic, ok } = mnesia:change_table_copy_type (schema,
node (),
disc_copies),
{ atomic, ok } = mnesia:add_table_copy (flass, node (), ram_copies),
mnesia:system_info ()
' -s erlang halt
# teardown: halt every database node over rpc; the results are ignored, so a
# node that is already down does not make this step fail
erl -setcookie mega -sname killah -noshell -noinput -eval '
"killah@" ++ Host = atom_to_list (node ()),
rpc:call (list_to_atom ("flassy@" ++ Host), erlang, halt, []),
rpc:call (list_to_atom ("turgy@" ++ Host), erlang, halt, []),
rpc:call (list_to_atom ("warezy@" ++ Host), erlang, halt, [])
' -s erlang halt
# remove the mnesia directories created by this run
rm -rf Mnesia*flassy*
rm -rf Mnesia*turgy*
rm -rf Mnesia*warezy*
-------------- next part --------------
--- /sw/lib/erlang/lib/mnesia-4.3.5/src/mnesia_dumper.erl 2007-06-11 04:31:23.000000000 -0700
+++ mnesia_dumper.erl 2007-12-11 16:16:26.000000000 -0800
@@ -41,7 +41,7 @@
%% Internal stuff
-export([regulator_init/1]).
--include("mnesia.hrl").
+-include_lib("mnesia/src/mnesia.hrl").
-include_lib("kernel/include/file.hrl").
-import(mnesia_lib, [fatal/2, dbg_out/2]).
@@ -826,6 +826,16 @@
end
end;
+insert_op(Tid, _, {op, del_table_copies, NStorList, TabDef}, InPlace, InitBy) ->
+ lists:foreach(fun({Node, Storage}) ->
+ insert_op(Tid,
+ void,
+ {op, del_table_copy, Storage, Node, TabDef},
+ InPlace,
+ InitBy)
+ end,
+ NStorList);
+
insert_op(Tid, _, {op, add_table_copy, _Storage, _Node, TabDef}, InPlace, InitBy) ->
%% During prepare commit, the files was created
%% and the replica was announced
-------------- next part --------------
--- mnesia-4.3.5/src/mnesia_schema.erl.orig 2007-03-27 06:37:32.000000000 -0700
+++ mnesia-4.3.5/src/mnesia_schema.erl 2007-11-19 11:21:04.000000000 -0800
@@ -2462,7 +2462,7 @@
InitR = #r{opaque = Opaque, module = Module},
case catch lists:foldl(fun check_restore_arg/2, InitR, Args) of
R when record(R, r) ->
- case mnesia_bup:read_schema(Module, Opaque) of
+ case mnesia_bup:read_schema(R#r.module, Opaque) of
{error, Reason} ->
{aborted, Reason};
BupSchema ->
More information about the erlang-questions
mailing list