[erlang-questions] how: mnesia with simultaneous permanent node failure (EC2)

Paul Mineiro paul-trapexit@REDACTED
Wed Dec 12 01:29:26 CET 2007


in order to get my disaster recovery situation managed on EC2, i went
ahead and wrote mnesia_schema:del_table_copies/2, which allows one to
remove a table's copies from multiple nodes in one operation.

attached are the patches to mnesia_schema.erl and mnesia_dumper.erl (also,
to be pleasant should these concepts be accepted, mnesia.erl should be
patched so that mnesia:del_table_copies/2 exists, but the meat is here).

also attached are the original script indicating "2 out of 3" node failure
where recovery was not possible, and a new script demonstrating the use of
the new call.

i'm not an mnesia expert so for all i know this is a really bad idea for
reasons i'm not seeing at the moment.  therefore, any input from mnesia
gurus would be greatly appreciated.

thanks,

-- p

On Sat, 1 Dec 2007, Paul Mineiro wrote:

> hi.
>
> i'm thinking about using mnesia on EC2, but i'm having problems figuring
> out disaster recovery.
>
> some background: on EC2, you can start as many machines (instances) as you
> like.  if you lose one, you can start another, but it'll have a different
> hostname.  when you lose an instance, you lose whatever was stored on the
> drive.
>
> i was trying to figure out what i would do with a distributed mnesia
> database when i lost a node.  i came up with a procedure based upon
> mnesia:del_table_copy/2 of the schema on the lost node which seems to work
> (attached as test-disaster-one).
>
> however when i tried to apply the procedure to simultaneous loss of two
> nodes, i ran into a problem; calling mnesia:del_table_copy/2 of schema
> requires all other nodes to be active, and in this scenario i have lost
> two nodes simultaneously (attached as test-disaster-two).
>
> any input from mnesia gurus would be greatly appreciated.
>
> thanks,
>
> -- p
>
> Optimism is an essential ingredient of innovation. How else can the
> individual favor change over security?
>
>   -- Robert Noyce

Optimism is an essential ingredient of innovation. How else can the
individual favor change over security?

  -- Robert Noyce
-------------- next part --------------
#! /bin/sh

# This script is intended to simulate loss and recovery of two
# EC2 nodes out of a pool of three
#
# Recovery is attempted by:
#   1. calling mnesia_schema:del_table_copies (schema, [LostNodes]) when the node dies
#   2. deleting the mnesia directory on the lost node (with EC2, this is
#      automatic, as there is no persistent disk)
#   3. restarting the node
#   4. calling mnesia:add_table_copy (TableName, RecoveredNode) to reinstall
#      tables

# start from a clean slate: remove anything in the current directory left
# over from an earlier run for the three node names used below
rm -rf Mnesia*flassy* Mnesia*turgy* Mnesia*warezy*

# first we start three nodes and get them running the same mnesia schema

# flassy (background): start mnesia, turn the local schema copy into
# disc_copies, answer a single ruthere probe with imok, then stay alive
erl -nostick -pa ../src -setcookie mega -sname flassy -s mnesia -noshell -noinput -eval '
  true = register (hello, self ()),
  { atomic, ok } = mnesia:change_table_copy_type (schema, node (), disc_copies),
  receive { Caller, ruthere } -> Caller ! imok end,
  timer:sleep (infinity)
' -s erlang halt &

# wazzup (foreground): wait one second, then ask the process registered as
# hello on flassy whether it is ready; the script blocks here until it
# answers imok
erl -nostick -pa ../src -setcookie mega -sname wazzup -noshell -noinput -eval '
  timer:sleep (1000),
  "wazzup@" ++ Host = atom_to_list (node ()),
  Flassy = list_to_atom ("flassy@" ++ Host),
  pong = net_adm:ping (Flassy),
  Owner = rpc:call (Flassy, erlang, whereis, [ hello ]),
  Owner ! { self (), ruthere },
  receive imok -> ok end
' -s erlang halt

# turgy (background): connect to flassy, point mnesia at the connected nodes
# via extra_db_nodes, turn the local schema copy into disc_copies, answer a
# single ruthere probe with imok, then stay alive
erl -nostick -pa ../src -setcookie mega -sname turgy -s mnesia -noshell -noinput -eval '
  true = register (hello, self ()),
  "turgy@" ++ Host = atom_to_list (node ()),
  Flassy = list_to_atom ("flassy@" ++ Host),
  pong = net_adm:ping (Flassy),
  Peers = erlang:nodes (),
  { ok, _ } = mnesia:change_config (extra_db_nodes, Peers),
  { atomic, ok } = mnesia:change_table_copy_type (schema, node (), disc_copies),
  receive { Caller, ruthere } -> Caller ! imok end,
  timer:sleep (infinity)
' -s erlang halt &

# wazzup (foreground): wait one second, then block until the hello process
# on turgy answers the ruthere probe with imok
erl -nostick -pa ../src -setcookie mega -sname wazzup -noshell -noinput -eval '
  timer:sleep (1000),
  "wazzup@" ++ Host = atom_to_list (node ()),
  Turgy = list_to_atom ("turgy@" ++ Host),
  pong = net_adm:ping (Turgy),
  Owner = rpc:call (Turgy, erlang, whereis, [ hello ]),
  Owner ! { self (), ruthere },
  receive imok -> ok end
' -s erlang halt

# warezy (background): connect to flassy and turgy, point mnesia at the
# connected nodes, turn the local schema copy into disc_copies, create the
# table flass with a disc copy on all three nodes, answer a single ruthere
# probe with imok, then stay alive
erl -nostick -pa ../src -setcookie mega -sname warezy -s mnesia -noshell -noinput -eval '
  true = register (hello, self ()),
  "warezy@" ++ Host = atom_to_list (node ()),
  Flassy = list_to_atom ("flassy@" ++ Host),
  Turgy = list_to_atom ("turgy@" ++ Host),
  Warezy = list_to_atom ("warezy@" ++ Host),
  pong = net_adm:ping (Flassy),
  pong = net_adm:ping (Turgy),
  { ok, _ } = mnesia:change_config (extra_db_nodes, erlang:nodes ()),
  { atomic, ok } = mnesia:change_table_copy_type (schema, node (), disc_copies),
  Replicas = [ Flassy, Turgy, Warezy ],
  { atomic, ok } = mnesia:create_table (flass, [ { disc_copies, Replicas } ]),
  receive { Caller, ruthere } -> Caller ! imok end,
  timer:sleep (infinity)
' -s erlang halt &

# wazzup (foreground): wait one second, then block until the hello process
# on warezy answers the ruthere probe with imok
erl -nostick -pa ../src -setcookie mega -sname wazzup -noshell -noinput -eval '
  timer:sleep (1000),
  "wazzup@" ++ Host = atom_to_list (node ()),
  Warezy = list_to_atom ("warezy@" ++ Host),
  pong = net_adm:ping (Warezy),
  Owner = rpc:call (Warezy, erlang, whereis, [ hello ]),
  Owner ! { self (), ruthere },
  receive imok -> ok end
' -s erlang halt

# now kill flassy and turgy and remove them from the schema

# killah: halt flassy and turgy, then ask the survivor warezy to drop both
# of them from the schema in one call. mnesia_schema:del_table_copies/2 is
# the patched-in function described in the accompanying message; the match
# on { atomic, ok } makes this node crash if the removal is refused
erl -nostick -pa ../src -setcookie mega -sname killah -noshell -noinput -eval '
  "killah@" ++ Host = atom_to_list (node ()),
  Flassy = list_to_atom ("flassy@" ++ Host),
  Turgy = list_to_atom ("turgy@" ++ Host),
  Warezy = list_to_atom ("warezy@" ++ Host),
  rpc:call (Flassy, erlang, halt, []),
  rpc:call (Turgy, erlang, halt, []),
  Lost = [ Flassy, Turgy ],
  { atomic, ok } = rpc:call (Warezy, mnesia_schema, del_table_copies, [ schema, Lost ])
' -s erlang halt

# the lost nodes come back without their disk contents (step 2 above)
rm -rf Mnesia*flassy* Mnesia*turgy*

# now restart turgy

# turgy (background) comes back empty: connect to the survivor warezy, point
# mnesia at the connected nodes, turn the local schema copy into disc_copies,
# add a ram_copies replica of flass, answer a single ruthere probe with imok,
# then stay alive
erl -nostick -pa ../src -setcookie mega -sname turgy -s mnesia -noshell -noinput -eval '
  true = register (hello, self ()),
  "turgy@" ++ Host = atom_to_list (node ()),
  Warezy = list_to_atom ("warezy@" ++ Host),
  pong = net_adm:ping (Warezy),
  Peers = erlang:nodes (),
  { ok, _ } = mnesia:change_config (extra_db_nodes, Peers),
  { atomic, ok } = mnesia:change_table_copy_type (schema, node (), disc_copies),
  { atomic, ok } = mnesia:add_table_copy (flass, node (), ram_copies),
  receive { Caller, ruthere } -> Caller ! imok end,
  timer:sleep (infinity)
' -s erlang halt &

# wazzup (foreground): wait one second, then block until the hello process
# on the restarted turgy answers the ruthere probe with imok
erl -nostick -pa ../src -setcookie mega -sname wazzup -noshell -noinput -eval '
  timer:sleep (1000),
  "wazzup@" ++ Host = atom_to_list (node ()),
  Turgy = list_to_atom ("turgy@" ++ Host),
  pong = net_adm:ping (Turgy),
  Owner = rpc:call (Turgy, erlang, whereis, [ hello ]),
  Owner ! { self (), ruthere },
  receive imok -> ok end
' -s erlang halt

# now restart flassy

# flassy (foreground) comes back empty: connect to turgy and warezy, point
# mnesia at the connected nodes, turn the local schema copy into disc_copies,
# add a ram_copies replica of flass, print mnesia:system_info () and exit
erl -nostick -pa ../src -setcookie mega -sname flassy -s mnesia -noshell -noinput -eval '
  "flassy@" ++ Host = atom_to_list (node ()),
  Turgy = list_to_atom ("turgy@" ++ Host),
  Warezy = list_to_atom ("warezy@" ++ Host),
  pong = net_adm:ping (Turgy),
  pong = net_adm:ping (Warezy),
  Peers = erlang:nodes (),
  { ok, _ } = mnesia:change_config (extra_db_nodes, Peers),
  { atomic, ok } = mnesia:change_table_copy_type (schema, node (), disc_copies),
  { atomic, ok } = mnesia:add_table_copy (flass, node (), ram_copies),
  mnesia:system_info ()
' -s erlang halt

# killah: halt whatever is still running of the three test nodes; the
# results of the rpc calls are ignored, so nodes that are already gone
# do not matter
erl -nostick -pa ../src -setcookie mega -sname killah -noshell -noinput -eval '
  "killah@" ++ Host = atom_to_list (node ()),
  lists:foreach (fun (Name) ->
                   rpc:call (list_to_atom (Name ++ "@" ++ Host), erlang, halt, [])
                 end,
                 [ "flassy", "turgy", "warezy" ])
' -s erlang halt

# remove the files this run left behind for the three node names
rm -rf Mnesia*flassy* Mnesia*turgy* Mnesia*warezy*

-------------- next part --------------
#! /bin/sh

# This script is intended to simulate loss and recovery of two
# EC2 nodes out of a pool of three
#
# Recovery is attempted by:
#   1. calling mnesia:del_table_copy (schema, LostNode) when the node dies
#   2. deleting the mnesia directory on the lost node (with EC2, this is
#      automatic, as there is no persistent disk)
#   3. restarting the node
#   4. calling mnesia:add_table_copy (TableName, RecoveredNode) to reinstall
#      tables
#
# Unfortunately this fails because the simultaneous failure means that 
# step 1. is not allowed because 
# "All replicas on diskfull nodes are not active yet"

# start from a clean slate: remove anything in the current directory left
# over from an earlier run for the three node names used below
rm -rf Mnesia*flassy* Mnesia*turgy* Mnesia*warezy*

# first we start three nodes and get them running the same mnesia schema

# flassy (background): start mnesia, turn the local schema copy into
# disc_copies, answer a single ruthere probe with imok, then stay alive
erl -setcookie mega -sname flassy -s mnesia -noshell -noinput -eval '
  true = register (hello, self ()),
  { atomic, ok } = mnesia:change_table_copy_type (schema, node (), disc_copies),
  receive { Caller, ruthere } -> Caller ! imok end,
  timer:sleep (infinity)
' -s erlang halt &

# wazzup (foreground): wait one second, then ask the process registered as
# hello on flassy whether it is ready; the script blocks here until it
# answers imok
erl -setcookie mega -sname wazzup -noshell -noinput -eval '
  timer:sleep (1000),
  "wazzup@" ++ Host = atom_to_list (node ()),
  Flassy = list_to_atom ("flassy@" ++ Host),
  pong = net_adm:ping (Flassy),
  Owner = rpc:call (Flassy, erlang, whereis, [ hello ]),
  Owner ! { self (), ruthere },
  receive imok -> ok end
' -s erlang halt

# turgy (background): connect to flassy, point mnesia at the connected nodes
# via extra_db_nodes, turn the local schema copy into disc_copies, answer a
# single ruthere probe with imok, then stay alive
erl -setcookie mega -sname turgy -s mnesia -noshell -noinput -eval '
  true = register (hello, self ()),
  "turgy@" ++ Host = atom_to_list (node ()),
  Flassy = list_to_atom ("flassy@" ++ Host),
  pong = net_adm:ping (Flassy),
  Peers = erlang:nodes (),
  { ok, _ } = mnesia:change_config (extra_db_nodes, Peers),
  { atomic, ok } = mnesia:change_table_copy_type (schema, node (), disc_copies),
  receive { Caller, ruthere } -> Caller ! imok end,
  timer:sleep (infinity)
' -s erlang halt &

# wazzup (foreground): wait one second, then block until the hello process
# on turgy answers the ruthere probe with imok
erl -setcookie mega -sname wazzup -noshell -noinput -eval '
  timer:sleep (1000),
  "wazzup@" ++ Host = atom_to_list (node ()),
  Turgy = list_to_atom ("turgy@" ++ Host),
  pong = net_adm:ping (Turgy),
  Owner = rpc:call (Turgy, erlang, whereis, [ hello ]),
  Owner ! { self (), ruthere },
  receive imok -> ok end
' -s erlang halt

# warezy (background): connect to flassy and turgy, point mnesia at the
# connected nodes, turn the local schema copy into disc_copies, create the
# table flass with a disc copy on all three nodes, answer a single ruthere
# probe with imok, then stay alive
erl -setcookie mega -sname warezy -s mnesia -noshell -noinput -eval '
  true = register (hello, self ()),
  "warezy@" ++ Host = atom_to_list (node ()),
  Flassy = list_to_atom ("flassy@" ++ Host),
  Turgy = list_to_atom ("turgy@" ++ Host),
  Warezy = list_to_atom ("warezy@" ++ Host),
  pong = net_adm:ping (Flassy),
  pong = net_adm:ping (Turgy),
  { ok, _ } = mnesia:change_config (extra_db_nodes, erlang:nodes ()),
  { atomic, ok } = mnesia:change_table_copy_type (schema, node (), disc_copies),
  Replicas = [ Flassy, Turgy, Warezy ],
  { atomic, ok } = mnesia:create_table (flass, [ { disc_copies, Replicas } ]),
  receive { Caller, ruthere } -> Caller ! imok end,
  timer:sleep (infinity)
' -s erlang halt &

# wazzup (foreground): wait one second, then block until the hello process
# on warezy answers the ruthere probe with imok
erl -setcookie mega -sname wazzup -noshell -noinput -eval '
  timer:sleep (1000),
  "wazzup@" ++ Host = atom_to_list (node ()),
  Warezy = list_to_atom ("warezy@" ++ Host),
  pong = net_adm:ping (Warezy),
  Owner = rpc:call (Warezy, erlang, whereis, [ hello ]),
  Owner ! { self (), ruthere },
  receive imok -> ok end
' -s erlang halt

# now kill flassy and turgy and remove them from the schema

# killah: halt flassy and turgy, then ask the survivor warezy to drop them
# from the schema one node at a time with the stock mnesia:del_table_copy/2.
# Each result is matched against { atomic, ok }, so this node crashes as soon
# as a removal is refused; per the header of this script that is what
# happens here ("All replicas on diskfull nodes are not active yet")
erl -setcookie mega -sname killah -noshell -noinput -eval '
  "killah@" ++ Host = atom_to_list (node ()),
  Flassy = list_to_atom ("flassy@" ++ Host),
  Turgy = list_to_atom ("turgy@" ++ Host),
  Warezy = list_to_atom ("warezy@" ++ Host),
  rpc:call (Flassy, erlang, halt, []),
  rpc:call (Turgy, erlang, halt, []),
  { atomic, ok } = rpc:call (Warezy, mnesia, del_table_copy, [ schema, Flassy ]),
  { atomic, ok } = rpc:call (Warezy, mnesia, del_table_copy, [ schema, Turgy ])
' -s erlang halt

# the lost nodes come back without their disk contents (step 2 above)
rm -rf Mnesia*flassy* Mnesia*turgy*

# now restart turgy

# turgy (background) comes back with an empty mnesia directory.
#
# flassy was halted by killah above and is only restarted further below, so
# the node to contact at this point is the survivor warezy; pinging flassy
# here could only end in a badmatch on pang.
#
# After rejoining, turgy turns its schema copy into disc_copies and adds a
# replica of flass (step 4 of the procedure in the header), in the same way
# the flassy restart below does. It then answers a single ruthere probe
# with imok and stays alive.
erl -setcookie mega -sname turgy -s mnesia -noshell -noinput -eval '
  true = register (hello, self ()),
  "turgy@" ++ Host = atom_to_list (node ()),
  pong = net_adm:ping (list_to_atom ("warezy@" ++ Host)),
  { ok, _ } = mnesia:change_config (extra_db_nodes, erlang:nodes ()),
  { atomic, ok } = mnesia:change_table_copy_type (schema,
                                                  node (),
                                                  disc_copies),
  { atomic, ok } = mnesia:add_table_copy (flass, node (), ram_copies),
  receive { From, ruthere } -> From ! imok end,
  receive after infinity -> ok end
' -s erlang halt &

# wazzup (foreground): wait one second, then block until the hello process
# on the restarted turgy answers the ruthere probe with imok
erl -setcookie mega -sname wazzup -noshell -noinput -eval '
  receive after 1000 -> ok end,
  "wazzup@" ++ Host = atom_to_list (node ()),
  pong = net_adm:ping (list_to_atom ("turgy@" ++ Host)),
  Pid = rpc:call (list_to_atom ("turgy@" ++ Host), erlang, whereis, [ hello ]),
  Pid ! { self (), ruthere },
  receive imok -> ok end
' -s erlang halt

# now restart flassy

# flassy (foreground) comes back empty: connect to turgy and warezy, point
# mnesia at the connected nodes, turn the local schema copy into disc_copies,
# add a ram_copies replica of flass, print mnesia:system_info () and exit
erl -setcookie mega -sname flassy -s mnesia -noshell -noinput -eval '
  "flassy@" ++ Host = atom_to_list (node ()),
  Turgy = list_to_atom ("turgy@" ++ Host),
  Warezy = list_to_atom ("warezy@" ++ Host),
  pong = net_adm:ping (Turgy),
  pong = net_adm:ping (Warezy),
  Peers = erlang:nodes (),
  { ok, _ } = mnesia:change_config (extra_db_nodes, Peers),
  { atomic, ok } = mnesia:change_table_copy_type (schema, node (), disc_copies),
  { atomic, ok } = mnesia:add_table_copy (flass, node (), ram_copies),
  mnesia:system_info ()
' -s erlang halt

# killah: halt whatever is still running of the three test nodes; the
# results of the rpc calls are ignored, so nodes that are already gone
# do not matter
erl -setcookie mega -sname killah -noshell -noinput -eval '
  "killah@" ++ Host = atom_to_list (node ()),
  lists:foreach (fun (Name) ->
                   rpc:call (list_to_atom (Name ++ "@" ++ Host), erlang, halt, [])
                 end,
                 [ "flassy", "turgy", "warezy" ])
' -s erlang halt

# remove the files this run left behind for the three node names
rm -rf Mnesia*flassy* Mnesia*turgy* Mnesia*warezy*

-------------- next part --------------
--- /sw/lib/erlang/lib/mnesia-4.3.5/src/mnesia_dumper.erl	2007-06-11 04:31:23.000000000 -0700
+++ mnesia_dumper.erl	2007-12-11 16:16:26.000000000 -0800
@@ -41,7 +41,7 @@
  %% Internal stuff
 -export([regulator_init/1]).
 	
--include("mnesia.hrl").
+-include_lib("mnesia/src/mnesia.hrl").
 -include_lib("kernel/include/file.hrl").
 
 -import(mnesia_lib, [fatal/2, dbg_out/2]).
@@ -826,6 +826,16 @@
 	    end
     end;
 
+insert_op(Tid, _, {op, del_table_copies, NStorList, TabDef}, InPlace, InitBy) ->
+    lists:foreach(fun({Node, Storage}) -> 
+			  insert_op(Tid, 
+				    void, 
+				    {op, del_table_copy, Storage, Node, TabDef},
+				    InPlace,
+				    InitBy)
+		  end,
+		  NStorList);
+
 insert_op(Tid, _, {op, add_table_copy, _Storage, _Node, TabDef}, InPlace, InitBy) ->
     %% During prepare commit, the files was created
     %% and the replica was announced
-------------- next part --------------
--- mnesia-4.3.5/src/mnesia_schema.erl.orig	2007-03-27 06:37:32.000000000 -0700
+++ mnesia-4.3.5/src/mnesia_schema.erl	2007-11-19 11:21:04.000000000 -0800
@@ -2462,7 +2462,7 @@
     InitR = #r{opaque = Opaque, module = Module},
     case catch lists:foldl(fun check_restore_arg/2, InitR, Args) of
 	R when record(R, r) ->
-	    case mnesia_bup:read_schema(Module, Opaque) of
+	    case mnesia_bup:read_schema(R#r.module, Opaque) of
 		{error, Reason} ->
 		    {aborted, Reason};
 		BupSchema -> 


More information about the erlang-questions mailing list