[erlang-questions] how: mnesia with simultaneous permanent node failure (EC2)

Paul Mineiro <>
Sat Dec 1 19:49:31 CET 2007


hi.

i'm thinking about using mnesia on EC2, but i'm having problems figuring
out disaster recovery.

some background: on EC2, you can start as many machines (instances) as you
like.  if you lose one, you can start another, but it'll have a different
hostname.  when you lose an instance, you lose whatever was stored on the
drive.

i was trying to figure out what i would do with a distributed mnesia
database when i lost a node.  i came up with a procedure based upon
mnesia:del_table_copy/2 of the schema on the lost node which seems to work
(attached as test-disaster-one).

however when i tried to apply the procedure to simultaneous loss of two
nodes, i ran into a problem; calling mnesia:del_table_copy/2 of schema
requires all other nodes to be active, and in this scenario i have lost
two nodes simultaneously (attached as test-disaster-two).

any input from mnesia gurus would be greatly appreciated.

thanks,

-- p

Optimism is an essential ingredient of innovation. How else can the
individual favor change over security?

  -- Robert Noyce
-------------- next part --------------
#! /bin/sh

# This script is intended to simulate loss and recovery of a single 
# EC2 node out of a pool of three
#
# Recovery is achieved by:
#   1. calling mnesia:del_table_copy (schema, LostNode) when the node dies
#   2. deleting the mnesia directory on the lost node (with EC2, this is
#      automatic, as there is no persistent disk)
#   3. restarting the node
#   4. calling mnesia:add_table_copy (TableName, RecoveredNode) to reinstall
#      tables

# start from a clean slate: remove any leftover mnesia directory for each of
# the three node names used below
for name in flassy turgy warezy; do
  rm -rf Mnesia*"$name"*
done

# first we start three nodes and get them running the same mnesia schema

# flassy: runs in the background with mnesia started.  The eval process
# registers itself as `hello`, changes the schema copy on this node to
# disc_copies, answers one { From, ruthere } message with imok, and then
# blocks forever so the node stays up.
erl -setcookie mega -sname flassy -s mnesia -noshell -noinput -eval '
  true = register (hello, self ()),
  { atomic, ok } = mnesia:change_table_copy_type (schema, node (), disc_copies),
  receive { From, ruthere } -> From ! imok end,
  receive after infinity -> ok end
' &

# wazzup: foreground helper node that holds the script until flassy is ready.
# It waits one second, pings flassy, looks up the `hello` process via rpc,
# sends it ruthere and blocks until imok comes back (which flassy only sends
# after its schema change succeeded).  The node then exits via -s erlang halt.
erl -setcookie mega -sname wazzup -noshell -noinput -eval '
  receive after 1000 -> ok end,
  "wazzup@" ++ Host = atom_to_list (node ()),
  pong = net_adm:ping (list_to_atom ("flassy@" ++ Host)),
  Pid = rpc:call (list_to_atom ("flassy@" ++ Host), erlang, whereis, [ hello ]),
  Pid ! { self (), ruthere },
  receive imok -> ok end
' -s erlang halt

# turgy: second background node.  It connects to flassy with a ping, hands
# the list of connected nodes to mnesia as extra_db_nodes, and then changes
# the schema copy on this node to disc_copies.  Like flassy it registers
# `hello`, answers one ruthere with imok, and then blocks forever.
erl -setcookie mega -sname turgy -s mnesia -noshell -noinput -eval '
  true = register (hello, self ()),
  "turgy@" ++ Host = atom_to_list (node ()),
  pong = net_adm:ping (list_to_atom ("flassy@" ++ Host)),
  { ok, _ } = mnesia:change_config (extra_db_nodes, erlang:nodes ()),
  { atomic, ok } = mnesia:change_table_copy_type (schema,
                                                  node (),
                                                  disc_copies),
  receive { From, ruthere } -> From ! imok end,
  receive after infinity -> ok end
' &

# wazzup: foreground helper that blocks the script until turgy replies imok,
# i.e. until turgy has finished the setup steps above.
erl -setcookie mega -sname wazzup -noshell -noinput -eval '
  receive after 1000 -> ok end,
  "wazzup@" ++ Host = atom_to_list (node ()),
  pong = net_adm:ping (list_to_atom ("turgy@" ++ Host)),
  Pid = rpc:call (list_to_atom ("turgy@" ++ Host), erlang, whereis, [ hello ]),
  Pid ! { self (), ruthere },
  receive imok -> ok end
' -s erlang halt

# warezy: third background node.  It pings both flassy and turgy, passes the
# connected nodes to mnesia as extra_db_nodes, changes the schema copy on
# this node to disc_copies, and then creates the test table `flass` with
# disc_copies on all three nodes.  Afterwards it answers one ruthere with
# imok and blocks forever.
erl -setcookie mega -sname warezy -s mnesia -noshell -noinput -eval '
  true = register (hello, self ()),
  "warezy@" ++ Host = atom_to_list (node ()),
  pong = net_adm:ping (list_to_atom ("flassy@" ++ Host)),
  pong = net_adm:ping (list_to_atom ("turgy@" ++ Host)),
  { ok, _ } = mnesia:change_config (extra_db_nodes, erlang:nodes ()),
  { atomic, ok } = mnesia:change_table_copy_type (schema,
                                                  node (),
                                                  disc_copies),
  { atomic, ok } = 
    mnesia:create_table (flass,
                         [ { disc_copies, 
                             [ list_to_atom ("flassy@" ++ Host),
                               list_to_atom ("turgy@" ++ Host),
                               list_to_atom ("warezy@" ++ Host) ] } ]),
  receive { From, ruthere } -> From ! imok end,
  receive after infinity -> ok end
' &

# wazzup: foreground helper that blocks the script until warezy replies imok,
# i.e. until the table has been created.
erl -setcookie mega -sname wazzup -noshell -noinput -eval '
  receive after 1000 -> ok end,
  "wazzup@" ++ Host = atom_to_list (node ()),
  pong = net_adm:ping (list_to_atom ("warezy@" ++ Host)),
  Pid = rpc:call (list_to_atom ("warezy@" ++ Host), erlang, whereis, [ hello ]),
  Pid ! { self (), ruthere },
  receive imok -> ok end
' -s erlang halt

# now kill flassy and remove it from the schema
#
# killah halts flassy through rpc (the result of that call is ignored), then
# asks warezy to run mnesia:del_table_copy (schema, flassy) and asserts that
# it returns { atomic, ok }.  This is step 1 of the recovery procedure.

erl -setcookie mega -sname killah -noshell -noinput -eval '
  "killah@" ++ Host = atom_to_list (node ()),
  rpc:call (list_to_atom ("flassy@" ++ Host), erlang, halt, []),
  { atomic, ok } = 
    rpc:call (list_to_atom ("warezy@" ++ Host), 
              mnesia,
              del_table_copy,
              [ schema, list_to_atom ("flassy@" ++ Host) ])
' -s erlang halt

# step 2: throw away the mnesia directory of the lost node (simulates the
# loss of the instance's disk)
rm -rf Mnesia*flassy*

# now restart flassy
#
# Steps 3 and 4 of the recovery procedure: the fresh flassy node pings the two
# surviving nodes, passes the connected nodes to mnesia as extra_db_nodes,
# changes its schema copy to disc_copies and adds a copy of `flass` on itself
# (as ram_copies here).  It runs in the foreground, calls
# mnesia:system_info () and then exits via -s erlang halt.

erl -setcookie mega -sname flassy -s mnesia -noshell -noinput -eval '
  "flassy@" ++ Host = atom_to_list (node ()),
  pong = net_adm:ping (list_to_atom ("turgy@" ++ Host)),
  pong = net_adm:ping (list_to_atom ("warezy@" ++ Host)),
  { ok, _ } = mnesia:change_config (extra_db_nodes, erlang:nodes ()),
  { atomic, ok } = mnesia:change_table_copy_type (schema,
                                                  node (),
                                                  disc_copies),
  { atomic, ok } = mnesia:add_table_copy (flass, node (), ram_copies),
  mnesia:system_info ()
' -s erlang halt

# teardown: halt every node that may still be running (results of the rpc
# calls are ignored, so nodes that are already down do not cause a failure)
# and remove all mnesia directories created by this run
erl -setcookie mega -sname killah -noshell -noinput -eval '
  "killah@" ++ Host = atom_to_list (node ()),
  rpc:call (list_to_atom ("flassy@" ++ Host), erlang, halt, []),
  rpc:call (list_to_atom ("turgy@" ++ Host), erlang, halt, []),
  rpc:call (list_to_atom ("warezy@" ++ Host), erlang, halt, [])
' -s erlang halt

rm -rf Mnesia*flassy*
rm -rf Mnesia*turgy*
rm -rf Mnesia*warezy*
-------------- next part --------------
#! /bin/sh

# This script is intended to simulate loss and recovery of two
# EC2 nodes out of a pool of three
#
# Recovery is attempted by:
#   1. calling mnesia:del_table_copy (schema, LostNode) when the node dies
#   2. deleting the mnesia directory on the lost node (with EC2, this is
#      automatic, as there is no persistent disk)
#   3. restarting the node
#   4. calling mnesia:add_table_copy (TableName, RecoveredNode) to reinstall
#      tables
#
# Unfortunately this fails because the simultaneous failure means that 
# step 1. is not allowed because 
# "All replicas on diskfull nodes are not active yet"

# start from a clean slate: remove any leftover mnesia directory for each of
# the three node names used below
for name in flassy turgy warezy; do
  rm -rf Mnesia*"$name"*
done

# first we start three nodes and get them running the same mnesia schema

# flassy: runs in the background with mnesia started.  The eval process
# registers itself as `hello`, changes the schema copy on this node to
# disc_copies, answers one { From, ruthere } message with imok, and then
# blocks forever so the node stays up.
# NOTE(review): -s erlang halt follows an -eval that ends in an infinite
# receive, so it presumably only takes effect if the eval returns - confirm.
erl -setcookie mega -sname flassy -s mnesia -noshell -noinput -eval '
  true = register (hello, self ()),
  { atomic, ok } = mnesia:change_table_copy_type (schema, node (), disc_copies),
  receive { From, ruthere } -> From ! imok end,
  receive after infinity -> ok end
' -s erlang halt &

# wazzup: foreground helper node that holds the script until flassy is ready.
# It waits one second, pings flassy, looks up the `hello` process via rpc,
# sends it ruthere and blocks until imok comes back.  The node then exits via
# -s erlang halt.
erl -setcookie mega -sname wazzup -noshell -noinput -eval '
  receive after 1000 -> ok end,
  "wazzup@" ++ Host = atom_to_list (node ()),
  pong = net_adm:ping (list_to_atom ("flassy@" ++ Host)),
  Pid = rpc:call (list_to_atom ("flassy@" ++ Host), erlang, whereis, [ hello ]),
  Pid ! { self (), ruthere },
  receive imok -> ok end
' -s erlang halt

# turgy: second background node.  It connects to flassy with a ping, hands
# the list of connected nodes to mnesia as extra_db_nodes, and then changes
# the schema copy on this node to disc_copies.  Like flassy it registers
# `hello`, answers one ruthere with imok, and then blocks forever.
erl -setcookie mega -sname turgy -s mnesia -noshell -noinput -eval '
  true = register (hello, self ()),
  "turgy@" ++ Host = atom_to_list (node ()),
  pong = net_adm:ping (list_to_atom ("flassy@" ++ Host)),
  { ok, _ } = mnesia:change_config (extra_db_nodes, erlang:nodes ()),
  { atomic, ok } = mnesia:change_table_copy_type (schema,
                                                  node (),
                                                  disc_copies),
  receive { From, ruthere } -> From ! imok end,
  receive after infinity -> ok end
' -s erlang halt &

# wazzup: foreground helper that blocks the script until turgy replies imok,
# i.e. until turgy has finished the setup steps above.
erl -setcookie mega -sname wazzup -noshell -noinput -eval '
  receive after 1000 -> ok end,
  "wazzup@" ++ Host = atom_to_list (node ()),
  pong = net_adm:ping (list_to_atom ("turgy@" ++ Host)),
  Pid = rpc:call (list_to_atom ("turgy@" ++ Host), erlang, whereis, [ hello ]),
  Pid ! { self (), ruthere },
  receive imok -> ok end
' -s erlang halt

# warezy: third background node.  It pings both flassy and turgy, passes the
# connected nodes to mnesia as extra_db_nodes, changes the schema copy on
# this node to disc_copies, and then creates the test table `flass` with
# disc_copies on all three nodes.  Afterwards it answers one ruthere with
# imok and blocks forever.
erl -setcookie mega -sname warezy -s mnesia -noshell -noinput -eval '
  true = register (hello, self ()),
  "warezy@" ++ Host = atom_to_list (node ()),
  pong = net_adm:ping (list_to_atom ("flassy@" ++ Host)),
  pong = net_adm:ping (list_to_atom ("turgy@" ++ Host)),
  { ok, _ } = mnesia:change_config (extra_db_nodes, erlang:nodes ()),
  { atomic, ok } = mnesia:change_table_copy_type (schema,
                                                  node (),
                                                  disc_copies),
  { atomic, ok } = 
    mnesia:create_table (flass,
                         [ { disc_copies, 
                             [ list_to_atom ("flassy@" ++ Host),
                               list_to_atom ("turgy@" ++ Host),
                               list_to_atom ("warezy@" ++ Host) ] } ]),
  receive { From, ruthere } -> From ! imok end,
  receive after infinity -> ok end
' -s erlang halt &

# wazzup: foreground helper that blocks the script until warezy replies imok,
# i.e. until the table has been created.
erl -setcookie mega -sname wazzup -noshell -noinput -eval '
  receive after 1000 -> ok end,
  "wazzup@" ++ Host = atom_to_list (node ()),
  pong = net_adm:ping (list_to_atom ("warezy@" ++ Host)),
  Pid = rpc:call (list_to_atom ("warezy@" ++ Host), erlang, whereis, [ hello ]),
  Pid ! { self (), ruthere },
  receive imok -> ok end
' -s erlang halt

# now kill flassy and turgy and remove them from the schema
#
# killah halts both nodes through rpc (results ignored), then asks warezy,
# the only node left, to run mnesia:del_table_copy (schema, Node) for each
# lost node and asserts { atomic, ok } each time.  As the header of this
# script explains, this is the step that is not allowed when two nodes are
# lost at once, so the first assertion is where the procedure breaks down.

erl -setcookie mega -sname killah -noshell -noinput -eval '
  "killah@" ++ Host = atom_to_list (node ()),
  rpc:call (list_to_atom ("flassy@" ++ Host), erlang, halt, []),
  rpc:call (list_to_atom ("turgy@" ++ Host), erlang, halt, []),
  { atomic, ok } = 
    rpc:call (list_to_atom ("warezy@" ++ Host), 
              mnesia,
              del_table_copy,
              [ schema, list_to_atom ("flassy@" ++ Host) ]),
  { atomic, ok } = 
    rpc:call (list_to_atom ("warezy@" ++ Host), 
              mnesia,
              del_table_copy,
              [ schema, list_to_atom ("turgy@" ++ Host) ])
' -s erlang halt

# step 2: throw away the mnesia directories of both lost nodes (simulates
# the loss of the instances' disks)
rm -rf Mnesia*flassy*
rm -rf Mnesia*turgy*

# now restart turgy
#
# The fresh turgy node runs in the background.  It registers `hello`, joins
# the cluster through warezy, passes the connected nodes to mnesia as
# extra_db_nodes, changes its schema copy to disc_copies, answers one ruthere
# with imok and then blocks forever.
#
# It must contact warezy rather than flassy: flassy was halted together with
# turgy above and is only restarted after this step, so asserting pong on a
# ping to flassy can never succeed here.  warezy is the only node still up.
#
# NOTE(review): unlike the flassy restart below, this step does not call
# mnesia:add_table_copy for flass on the recovered node - confirm whether
# that is intended.

erl -setcookie mega -sname turgy -s mnesia -noshell -noinput -eval '
  true = register (hello, self ()),
  "turgy@" ++ Host = atom_to_list (node ()),
  pong = net_adm:ping (list_to_atom ("warezy@" ++ Host)),
  { ok, _ } = mnesia:change_config (extra_db_nodes, erlang:nodes ()),
  { atomic, ok } = mnesia:change_table_copy_type (schema,
                                                  node (),
                                                  disc_copies),
  receive { From, ruthere } -> From ! imok end,
  receive after infinity -> ok end
' -s erlang halt &

# wazzup: foreground helper that blocks the script until the restarted turgy
# replies imok, i.e. until it has finished the steps above.
erl -setcookie mega -sname wazzup -noshell -noinput -eval '
  receive after 1000 -> ok end,
  "wazzup@" ++ Host = atom_to_list (node ()),
  pong = net_adm:ping (list_to_atom ("turgy@" ++ Host)),
  Pid = rpc:call (list_to_atom ("turgy@" ++ Host), erlang, whereis, [ hello ]),
  Pid ! { self (), ruthere },
  receive imok -> ok end
' -s erlang halt

# now restart flassy
#
# The fresh flassy node pings turgy and warezy, passes the connected nodes to
# mnesia as extra_db_nodes, changes its schema copy to disc_copies and adds a
# copy of `flass` on itself (as ram_copies here).  It runs in the foreground,
# calls mnesia:system_info () and then exits via -s erlang halt.

erl -setcookie mega -sname flassy -s mnesia -noshell -noinput -eval '
  "flassy@" ++ Host = atom_to_list (node ()),
  pong = net_adm:ping (list_to_atom ("turgy@" ++ Host)),
  pong = net_adm:ping (list_to_atom ("warezy@" ++ Host)),
  { ok, _ } = mnesia:change_config (extra_db_nodes, erlang:nodes ()),
  { atomic, ok } = mnesia:change_table_copy_type (schema,
                                                  node (),
                                                  disc_copies),
  { atomic, ok } = mnesia:add_table_copy (flass, node (), ram_copies),
  mnesia:system_info ()
' -s erlang halt

# teardown: halt every node that may still be running (results of the rpc
# calls are ignored, so nodes that are already down do not cause a failure)
# and remove all mnesia directories created by this run
erl -setcookie mega -sname killah -noshell -noinput -eval '
  "killah@" ++ Host = atom_to_list (node ()),
  rpc:call (list_to_atom ("flassy@" ++ Host), erlang, halt, []),
  rpc:call (list_to_atom ("turgy@" ++ Host), erlang, halt, []),
  rpc:call (list_to_atom ("warezy@" ++ Host), erlang, halt, [])
' -s erlang halt

rm -rf Mnesia*flassy*
rm -rf Mnesia*turgy*
rm -rf Mnesia*warezy*



More information about the erlang-questions mailing list