[erlang-bugs] bug in HiPE for <<_/utf8,...>>

Johannes Weißl jargon@REDACTED
Thu Apr 2 10:20:02 CEST 2015


A short update: The reported HiPE bug is still present and reproducible
in OTP 17.5 and 18.0-rc1.

We had to modify the 'crash.erl' program a little bit after commits
5a7b211 and 7b10ff7 (this shows how fragile it is!), so I'm attaching
the latest version:

MD5 (crash.erl) = 41d0e4e8ed7039dc898a16135aa62bcb
MD5 (crash_it.escript) = f7756e997d9ca28f6d523086e8c37f91
MD5 (data.jsn) = a0ee43e0e63aea6f3c89c41cc3b5d378

To reproduce the bug, save all three files in one directory, make
'crash_it.escript' executable and run it. It compiles 'crash.erl'
with HiPE enabled, and produces a crash, which does not occur without
HiPE. See also the 'Details' section below for more information about
the bug.

Johannes Weißl and Sebastian Egner

On Mon, Sep 09, 2013 at 02:20PM +0000, Sebastian Egner wrote:
> Hi,
> There seems to be a Heisenbug in HiPE related to matching <<_/utf8,...>>.
> After a long and bloody fight, we have been able to isolate the problem to the degree
> that it is sufficiently reproducible. See details below.
> We strongly suspect that the problem is a genuine bug related to the binary matching
> and the garbage collector. Whether the bug is hit depends on the memory contents
> of previously allocated heap-allocated binaries.
> Best regards,
> Johannes Weissl and Sebastian Egner.
> --
> Details:
> - The program 'crash.erl' loads a JSON sample file. Then it parses the file again and again,
>   and after a wildly varying number of iterations (100-100000) the parser fails.
> - To run the program, execute "crash_it" in a directory containing "crash.erl" and "data.jsn".
>   When the bug is hit, the program stops. This takes several seconds to minutes.
> - The problem manifests itself when <<"0123...">> does not match <<_/utf8,_/binary>>
>   in the function crash:check_utf8_binary/1. (The program aborts with an exception exit.)
> - Surprisingly, we have not been able to reduce the program even more.
>   In particular, when randomize_memory/0 is not called, the problem is much less frequent.
> - The bug is present in R13B02, R14B04, R16B01, "maint" (2f28245) and master (45eaf81).
> - The bug is present under MacOSX (10.8.4), Debian GNU/Linux and a Linux in an ARM emulator.
>   This indicates that the bug is not related to the operating system platform.
> - We have run the program in Valgrind and found conditionals that depend on uninitialised
>   values. Refer to "valgrind.out" for details.
> Attachments:
> MD5 (crash.erl) = 1f1507c8238e2136d9163314bcac0045
> MD5 (crash_it) = 4061276b89dfc822cbfc22002f202358
> MD5 (data.jsn) = c5b503cc61d76adc7dcb60832a123b99
> MD5 (valgrind.out) = 2e6f67bf06b3df66c6daf728444b9b66
%% The program loads a data file ("data.jsn") once, then repeatedly
%% decodes it as a JSON subset and checks that the strings are valid
%% UTF-8, which they are (they are even plain ASCII).
%% After some number of iterations, the check crashes.
%% Unfortunately, the behaviour does not reproduce well.
%% The rate of crashes is very sensitive to the contents
%% of the program. Also in some cases the process does not
%% seem to crash at all; we start several parallel Erlang
%% processes (instances) to increase the total probability
%% of a crash.


%% Entry point for one test instance. Reads "data.jsn" (relative to the
%% current working directory) once; the {ok,Data} match fails with
%% badmatch if the file cannot be read. The contents are then handed to
%% loop_parse/3, starting at iteration 1. Instance is passed through
%% unchanged.
start(Instance) ->
    {ok,Data} = file:read_file("data.jsn"),
    loop_parse(1, Data, Instance).

%% Parses Data again and again, counting the iterations in N.
%%
%% A thrown {check_utf8_binary,Key} (raised by to_atom/1) means the bug
%% was hit: the iteration, the instance and the offending key are
%% printed and the emulator is stopped. Otherwise a progress line is
%% printed for the first iteration and for every 100th one, and the
%% loop continues with N + 1. Any other exception is not caught here.
loop_parse(N, Data, Instance) ->
    try   {ok,_} = parse(Data)
    catch {check_utf8_binary,Key} ->
            io:fwrite("; bug tripped in iteration ~p of instance ~p for Key = ~p.~n",
                      [N, Instance, Key]),
            %% Use with "+Mim true" in crash_it.escript for further information
            %% io:fwrite("instrument:memory_data() ->~n  ~p.~n", [instrument:memory_data()]),
            %% NOTE(review): the terminating call was missing from this
            %% copy of the file; stopping the emulator with status 0 is
            %% a reconstruction -- confirm against the original attachment.
            halt(0)
    end,
    case N rem 100 == 0 orelse N == 1 of
        true -> io:fwrite("; ~b iterations OK of instance ~p.~n",
                          [N, Instance]);
        false -> ok
    end,
    loop_parse(N + 1, Data, Instance).

%% -- stripped down code from our application --

%% Splits Data at every newline (the 'trim' option drops trailing empty
%% parts) and passes the resulting list of lines to fold_list/2 with an
%% empty accumulator. Returns whatever fold_list/2 returns.
parse(Data) ->
    Lines = binary:split(Data, <<"\n">>, [global, trim]),
    fold_list([], Lines).

%% Decodes every line with from_utf8/1 and collects the decoded values
%% in the accumulator List. Each value is prepended, so the result is in
%% reverse line order. A line that does not decode to {ok,Json} fails
%% the match (badmatch).
%%
%% Returns {ok,List}.
%% NOTE(review): the body of the final clause was missing from this copy
%% of the file. The visible caller only matches {ok,_}, so the shape of
%% the tuple is certain; whether the original reversed List is not.
fold_list(List, [Utf8|Utf8s]) ->
    {ok,Json} = from_utf8(Utf8),
    fold_list([Json|List], Utf8s);
fold_list(List, []) ->
    {ok,List}.

%% Decodes one binary containing a single value of the supported JSON
%% subset: tokenizes it with toks/2, then builds the value with
%% from_toks/1. The {ok,Value,[]} match requires that no tokens are left
%% over after the value.
%%
%% Returns {ok,Value}, which is the shape fold_list/2 matches on.
from_utf8(Utf8) ->
    {ok,Toks} = toks(Utf8, []),
    {ok,Value,[]} = from_toks(Toks),
    {ok,Value}.

%% Builds one value from the front of a token list and returns
%% {ok,Value,RemainingToks}:
%%   - a binary token is a string and is returned as is;
%%   - lbrac starts an array  (see array_from_toks/2);
%%   - lpar  starts an object (see object_from_toks/2).
%% Any other leading token, or an empty list, raises function_clause.
from_toks([S|Toks]) when is_binary(S) -> {ok,S,Toks};
from_toks([lbrac|Toks]) -> array_from_toks(Toks, []);
from_toks([lpar|Toks]) -> object_from_toks(Toks, []).

%% Parses the elements of an array; the opening lbrac has already been
%% consumed. RevValues accumulates the elements in reverse order.
%%
%% After each element either a comma (more elements follow) or rbrac
%% (end of array) is expected; anything else raises case_clause.
%% Returns {ok,Values,RemainingToks} with Values in source order.
%% An empty array is not supported: from_toks/1 has no clause for a
%% leading rbrac.
array_from_toks(Toks, RevValues) ->
    case from_toks(Toks) of
        {ok,Value,[comma|Toks1]} -> array_from_toks(Toks1, [Value|RevValues]);
        {ok,Value,[rbrac|Toks1]} -> {ok,lists:reverse([Value|RevValues]),Toks1}
    end.

%% Parses the members of an object; the opening lpar has already been
%% consumed. Each member must be a binary key, a colon and a value.
%% RevKVs accumulates the {Key,Value} pairs in reverse order.
%%
%% After each member either a comma (more members follow) or rpar (end
%% of object) is expected; anything else raises case_clause.
%% Returns {ok,Object,RemainingToks}, where Object is built by obj/1.
%% NOTE(review): the body of the rpar branch was missing from this copy
%% of the file. It is reconstructed by analogy with array_from_toks/2
%% and from obj/1, which has no other visible caller -- confirm against
%% the original attachment.
object_from_toks([K,colon|Toks1], RevKVs) when is_binary(K) ->
    case from_toks(Toks1) of
        {ok,V,[comma|Toks2]} ->
            object_from_toks(Toks2, [{K,V}|RevKVs]);
        {ok,V,[rpar|Toks2]} ->
            {ok,obj(lists:reverse([{K,V}|RevKVs])),Toks2}
    end.

%% Tokenizer for the supported JSON subset. Consumes the input binary
%% from the front and accumulates tokens in reverse order in Toks:
%%   "[" -> lbrac   "]" -> rbrac   "{" -> lpar   "}" -> rpar
%%   "," -> comma   ":" -> colon
%% A double quote starts a string, which toks_string/3 reads.
%% At the end of the input, returns {ok,Tokens} in source order.
%% Any other byte (including whitespace) raises function_clause.
toks(<<"[",T/binary>>, Toks) -> toks(T, [lbrac|Toks]);
toks(<<"]",T/binary>>, Toks) -> toks(T, [rbrac|Toks]);
toks(<<"{",T/binary>>, Toks) -> toks(T, [lpar|Toks]);
toks(<<"}",T/binary>>, Toks) -> toks(T, [rpar|Toks]);
toks(<<",",T/binary>>, Toks) -> toks(T, [comma|Toks]);
toks(<<":",T/binary>>, Toks) -> toks(T, [colon|Toks]);
toks(<<"\"",T/binary>>, Toks) -> toks_string(<<>>, T, Toks);
toks(<<>>, Toks) -> {ok,lists:reverse(Toks)}.

%% Reads the body of a string; the opening double quote has already
%% been consumed. Bytes are appended one at a time to the binary Acc
%% until the closing double quote, at which point Acc is pushed as a
%% string token and tokenizing continues in toks/2.
%% Backslash escapes are not interpreted. Input that ends before the
%% closing quote raises function_clause.
toks_string(Acc, <<"\"",T/binary>>, Toks) ->
    toks(T, [Acc|Toks]);
toks_string(Acc, <<C,T/binary>>, Toks) ->
    toks_string(<<Acc/binary,C>>, T, Toks).

%% Builds the object representation {obj,Dict} from a list of
%% {Key,Value} pairs. Every key is converted with to_atom/1, and the
%% pairs are stored with from_list/1. List elements that are not
%% 2-tuples are skipped by the comprehension's generator pattern.
obj(KVs) when is_list(KVs) ->
    {obj,from_list([{to_atom(K),V} || {K,V} <- KVs])}.

%% Converts the binary Key to an atom after checking that it is valid
%% UTF-8 with check_utf8_binary/1.
%%
%% A function_clause error raised during the check is turned into
%% throw({check_utf8_binary,Key}); loop_parse/3 catches that and reports
%% it as the bug being tripped. Other errors propagate unchanged.
to_atom(Key) ->
    try   ok = check_utf8_binary(Key)
    catch error:function_clause -> throw({check_utf8_binary,Key})
    end,
    binary_to_atom(Key, utf8).

%% Checks a single code point. Returns ok when Cp passes all four range
%% tests below; otherwise the failing "ok = ..." match raises
%% {badmatch,{error,codepoint,Cp}}.
%%
%% Rejected values: Cp < 0 or Cp >= 16#110000; 16#d800..16#dfff;
%% 16#fdd0..16#fdef; and any Cp whose low 16 bits are 16#fffe or 16#ffff.
check_codepoint(Cp) ->
    %% Not all cases required for crash!
    %% The cases are split in separate if-expressions so that they are
    %% not optimized away by the compiler (since 7b10ff7, OTP 18).
    ok = if Cp < 0; Cp >= 16#110000      -> {error,codepoint,Cp};
            true -> ok
         end,
    ok = if 16#d800 =< Cp, Cp =< 16#dfff -> {error,codepoint,Cp};
            true -> ok
         end,
    ok = if 16#fdd0 =< Cp, Cp =< 16#fdef -> {error,codepoint,Cp};
            true -> ok
         end,
    ok = if Cp band 16#ffff >= 16#fffe   -> {error,codepoint,Cp};
            true -> ok
         end.

%% Walks a binary one UTF-8 encoded code point at a time and checks each
%% code point with check_codepoint/1. Returns ok once the whole binary
%% has been consumed.
%%
%% If the head of the binary does not match <<_/utf8,_/binary>> no
%% clause applies and function_clause is raised; to_atom/1 relies on
%% that. The clause marked (*) is the match that fails under HiPE.
check_utf8_binary(<<Cp/utf8,More/binary>>) -> % (*)
    ok = check_codepoint(Cp),
    check_utf8_binary(More);
check_utf8_binary(<<>>) ->
    ok.

%% -- from orddict.erl before 5a7b211 --

%% Builds a dictionary from a list of {Key,Value} pairs by folding
%% store/3 over Pairs from left to right, starting with the empty
%% dictionary []. An element of Pairs that is not a 2-tuple makes the
%% fun fail with function_clause.
from_list(Pairs) ->
    lists:foldl(fun ({K,V}, D) -> store(K, V, D) end, [], Pairs).

%% Stores Key with value New in a dictionary represented as a list of
%% {Key,Value} pairs sorted by key, and returns the updated list.
%%   - Key sorts before the head: the new pair is inserted in front.
%%   - Key sorts after the head:  the head is kept, recurse on the tail.
%%   - otherwise (Key == K):      the old pair is replaced.
%%   - empty dictionary:          the result is a single pair.
store(Key, New, [{K,_}=E|Dict]) when Key < K ->
    [{Key,New},E|Dict];
store(Key, New, [{K,_}=E|Dict]) when Key > K ->
    [E|store(Key, New, Dict)];
store(Key, New, [{_K,_Old}|Dict]) ->
    [{Key,New}|Dict];
store(Key, New, []) -> [{Key,New}].
#!/usr/bin/env escript

%% The following enables memory tracing in the emulator:
%%! +Mim true

%% Run as:
%% rm -f crash.beam && ./crash_it.escript [timeout_seconds]

%% escript entry point. Compiles with make:all/1 using native (HiPE)
%% options, prints the OTP release, starts Instances processes that each
%% run crash:start/1 with their instance number, and then blocks forever
%% in a receive for a message that is never sent.
%%
%% NOTE(review): this copy of the function is incomplete and does not
%% parse. Args is not used in the visible body (presumably it was passed
%% to process_args/1), the call that takes the fun and the sequence
%% (presumably lists:foreach/2) is missing, and so are the option list
%% of erlang:spawn_opt/4 and the 'end' of the fun. Restore these from
%% the original attachment; the GC settings cannot be inferred from here.
main(Args) ->
    make:all([native,{hipe,[verbose,o3]}]), % trips the bug
    %% make:all([]), % does not trip the bug

    io:fwrite("OTP ~s.~n", [erlang:system_info(otp_release)]),

    %% Sometimes a process won't trip the bug, regardless of how many
    %% iterations it runs, so we start a number of parallel processes to
    %% increase the probability that one of them trips the bug.
    %%    The bug seems to be related to a minor GC not yet executed. The GC
    %% settings below seem to increase the rate of tripping the bug.
    Instances = 10,
      %% NOTE(review): the enclosing call (presumably lists:foreach/2)
      %% is missing above this line.
      fun (Instance) ->
              %% NOTE(review): the spawn options (the GC settings
              %% mentioned above) and the closing 'end' are missing.
              erlang:spawn_opt(crash, start, [Instance],
      lists:seq(1, Instances)),
    %% Keep the script alive while the spawned instances run.
    receive never -> ok end.

%% Handles the command line arguments of the script.
%%   []           - no timeout; nothing to do.
%%   [TimeoutStr] - timeout in seconds as a decimal string. A linked
%%                  process is started that runs halt_after/1 with the
%%                  timeout converted to milliseconds.
%%   anything else - prints a usage line and stops the emulator.
%%
%% NOTE(review): the clause bodies were partly missing from this copy of
%% the file. The 'ok' return values, the io:fwrite/2 call around the
%% usage string and the exit status 1 are reconstructions -- confirm
%% against the original attachment.
process_args([]) ->
    ok;
process_args([TimeoutStr]) ->
    Timeout = list_to_integer(TimeoutStr) * 1000, % [ms]
    spawn_link(fun() -> halt_after(Timeout) end),
    ok;
process_args(_) ->
    io:fwrite(
              "usage: crash_it.escript [timeout_seconds]~n", []),
    halt(1).

%% Waits Timeout milliseconds, reports that the bug was not tripped in
%% that time, and stops the emulator with exit status 3.
%%
%% NOTE(review): the wait and the final halt were missing from this copy
%% of the file; they are reconstructed from the function name and from
%% the text of the message below.
halt_after(Timeout) ->
    timer:sleep(Timeout),
    Status = 3,
    io:fwrite("bug not tripped after ~w ms, exit with status ~w~n",
              [Timeout, Status]),
    halt(Status).
