Getting inconsistent out put with map-reduce queries.

Naveen Tamanam naveen32india at
Wed Jan 22 07:39:40 EST 2014


I am running riak  of  version  riak-1.3.2,  on CentOS release 6.4.  Using
the riak with the  django configured with apache. Apache is configured to
run different python processes.
I am running 5 node cluster  on the same machine.

I am retrieving the  required data with the  map-reduce queries.  Thing is
that  I am getting inconsistence out put each time.
Some times not getting enough data as expected, some times getting.

I tested by logging into the my web app from different (2 )locations.  The
requests are getting served by different python process by apache. And both
request processes are connected to the one riak node only. At one location
I got the data expected and in other location I did not.

Please let me know any  precautions that I need to take in the
configuration, and precautions need to be taken when running map reduce.
I would like to know recommended configuration values for better
performance and usage.

How can I get the  consistent out put each time from the map reduce..?

Here is my riak  app.config

%% -*- mode: erlang;erlang-indent-level: 4;indent-tabs-mode: nil -*-
%% ex: ft=erlang ts=4 sw=4 et
 %% Riak Client APIs config
 {riak_api, [
            %% pb_backlog is the maximum length to which the queue of pending
            %% connections may grow. If set, it must be an integer >= 0.
            %% By default the value is 5. If you anticipate a huge number of
            %% connections being initialised *simultaneously*, set this number
            %% higher.
            %% {pb_backlog, 64},

            %% pb_ip is the IP address that the Riak Protocol Buffers interface
            %% will bind to.  If this is undefined, the interface will not run.
            {pb_ip,   "" },

            %% pb_port is the TCP port that the Riak Protocol Buffers interface
            %% will bind to
            {pb_port, 10017 }

 %% Riak Core config
 {riak_core, [
              %% Default location of ringstate
              {ring_state_dir, "./data/ring"},

              %% Default ring creation size.  Make sure it is a power of 2,
              %% e.g. 16, 32, 64, 128, 256, 512 etc
              %{ring_creation_size, 64},

              %% http is a list of IP addresses and TCP ports that the Riak
              %% HTTP interface will bind.
              {http, [ {"", 10018 } ]},

              %% https is a list of IP addresses and TCP ports that the Riak
              %% HTTPS interface will bind.
              %{https, [{ "", 10018 }]},

              %% Default cert and key locations for https can be overridden
              %% with the ssl config variable, for example:
              %{ssl, [
              %       {certfile, "./etc/cert.pem"},
              %       {keyfile, "./etc/key.pem"}
              %      ]},

              %% riak_handoff_port is the TCP port that Riak uses for
              %% intra-cluster data handoff.
              {handoff_port, 10019 },

              %% To encrypt riak_core intra-cluster data handoff traffic,
              %% uncomment the following line and edit its path to an
              %% appropriate certfile and keyfile.  (This example uses a
              %% single file with both items concatenated together.)
              %{handoff_ssl_options, [{certfile, "/tmp/erlserver.pem"}]},

              %% DTrace support
              %% Do not enable 'dtrace_support' unless your Erlang/OTP
              %% runtime is compiled to support DTrace.  DTrace is
              %% available in R15B01 (supported by the Erlang/OTP
              %% official source package) and in R14B04 via a custom
              %% source repository & branch.
              {dtrace_support, false},

              %% Health Checks
              %% If disabled, health checks registered by an application will
              %% be ignored. NOTE: this option cannot be changed at runtime.
              %% To re-enable, the setting must be changed and the
node restarted.
              %% NOTE: As of Riak 1.3.2, health checks are deprecated as they
              %% may interfere with the new overload protection mechanisms.
              %% If there is a good reason to re-enable them, you must uncomment
              %% this line and also add an entry in the riak_kv section:
              %%          {riak_kv, [ ..., {enable_health_checks, true}, ...]}
              %% {enable_health_checks, true},

              %% Platform-specific installation paths (substituted by rebar)
              {platform_bin_dir, "./bin"},
              {platform_data_dir, "./data"},
              {platform_etc_dir, "./etc"},
              {platform_lib_dir, "./lib"},
              {platform_log_dir, "./log"}

 %% Riak KV config
 {riak_kv, [
            %% Storage_backend specifies the Erlang module defining the storage
            %% mechanism that will be used on this node.
            {storage_backend, riak_kv_bitcask_backend},

            %% raw_name is the first part of all URLS used by the Riak raw HTTP
            %% interface.  See riak_web.erl and raw_http_resource.erl for
            %% details.
            %{raw_name, "riak"},

            %% Enable active anti-entropy subsystem + optional debug messages:
            %%   {anti_entropy, {on|off, []}},
            %%   {anti_entropy, {on|off, [debug]}},
            {anti_entropy, {on, []}},

            %% Restrict how fast AAE can build hash trees. Building the tree
            %% for a given partition requires a full scan over that partition's
            %% data. Once built, trees stay built until they are expired.
            %% Config is of the form:
            %%   {num-builds, per-timespan-in-milliseconds}
            %% Default is 1 build per hour.
            {anti_entropy_build_limit, {1, 3600000}},

            %% Determine how often hash trees are expired after being built.
            %% Periodically expiring a hash tree ensures the on-disk hash tree
            %% data stays consistent with the actual k/v backend data. It also
            %% helps Riak identify silent disk failures and bit rot. However,
            %% expiration is not needed for normal AAE operation and should be
            %% infrequent for performance reasons. The time is specified in
            %% milliseconds. The default is 1 week.
            {anti_entropy_expire, 604800000},

            %% Limit how many AAE exchanges/builds can happen concurrently.
            {anti_entropy_concurrency, 2},

            %% The tick determines how often the AAE manager looks for work
            %% to do (building/expiring trees, triggering exchanges, etc).
            %% The default is every 15 seconds. Lowering this value will
            %% speedup the rate that all replicas are synced across the cluster.
            %% Increasing the value is not recommended.
            {anti_entropy_tick, 15000},

            %% The directory where AAE hash trees are stored.
            {anti_entropy_data_dir, "./data/anti_entropy"},

            %% The LevelDB options used by AAE to generate the LevelDB-backed
            %% on-disk hashtrees.
            {anti_entropy_leveldb_opts, [{write_buffer_size, 4194304},
                                         {max_open_files, 20}]},

            %% mapred_name is URL used to submit map/reduce requests to Riak.
            {mapred_name, "mapred"},

            %% mapred_2i_pipe indicates whether secondary-index
            %% MapReduce inputs are queued in parallel via their own
            %% pipe ('true'), or serially via a helper process
            %% ('false' or undefined).  Set to 'false' or leave
            %% undefined during a rolling upgrade from 1.0.
            {mapred_2i_pipe, true},

            %% Each of the following entries control how many Javascript
            %% virtual machines are available for executing map, reduce,
            %% pre- and post-commit hook functions.
            {map_js_vm_count, 18 },
            {reduce_js_vm_count, 46 },
            {hook_js_vm_count, 32 },

            %% js_max_vm_mem is the maximum amount of memory, in megabytes,
            %% allocated to the Javascript VMs. If unset, the default is
            %% 8MB.
            {js_max_vm_mem, 8},

            %% js_thread_stack is the maximum amount of thread stack,
in megabyes,
            %% allocate to the Javascript VMs. If unset, the default is 16MB.
            %% NOTE: This is not the same as the C thread stack.
            {js_thread_stack, 16},

            %% js_source_dir should point to a directory containing Javascript
            %% source files which will be loaded by Riak when it initializes
            %% Javascript VMs.
            %{js_source_dir, "/tmp/js_source"},

            %% http_url_encoding determines how Riak treats URL encoded
            %% buckets, keys, and links over the REST API. When set to 'on'
            %% Riak always decodes encoded values sent as URLs and Headers.
            %% Otherwise, Riak defaults to compatibility mode where links
            %% are decoded, but buckets and keys are not. The compatibility
            %% mode will be removed in a future release.
            {http_url_encoding, on},

            %% Switch to vnode-based vclocks rather than client ids.  This
            %% significantly reduces the number of vclock entries.
            %% Only set true if *all* nodes in the cluster are upgraded to 1.0
            {vnode_vclocks, true},

            %% This option toggles compatibility of keylisting with 1.0
            %% and earlier versions.  Once a rolling upgrade to a version
            %% > 1.0 is completed for a cluster, this should be set to
            %% true for better control of memory usage during key listing
            %% operations
            {listkeys_backpressure, true},

            %% This option specifies how many of each type of fsm may exist
            %% concurrently.  This is for overload protection and is a new
            %% mechanism that obsoletes 1.3's health checks. Note that
this number
            %% represents two potential processes, so +P in vm.args
should be at
            %% least 3X the fsm_limit.
            {fsm_limit, 50000}

 %% Riak Search Config
 {riak_search, [
                %% To enable Search functionality set this 'true'.
                {enabled, false}

 %% Merge Index Config
 {merge_index, [
                %% The root dir to store search merge_index data
                {data_root, "./data/merge_index"},

                %% Size, in bytes, of the in-memory buffer.  When this
                %% threshold has been reached the data is transformed
                %% into a segment file which resides on disk.
                {buffer_rollover_size, 1048576},

                %% Overtime the segment files need to be compacted.
                %% This is the maximum number of segments that will be
                %% compacted at once.  A lower value will lead to
                %% quicker but more frequent compactions.
                {max_compact_segments, 20}

 %% Bitcask Config
 {bitcask, [
             %% Configure how Bitcask writes data to disk.
             %%   erlang: Erlang's built-in file API
             %%      nif: Direct calls to the POSIX C API
             %% The NIF mode provides higher throughput for certain
             %% workloads, but has the potential to negatively impact
             %% the Erlang VM, leading to higher worst-case latencies
             %% and possible throughput collapse.
             {io_mode, erlang},

             {data_root, "./data/bitcask"}

 %% eLevelDB Config
 {eleveldb, [
             {data_root, "./data/leveldb"}

 %% Lager Config
 {lager, [
            %% What handlers to install with what arguments
            %% The defaults for the logfiles are to rotate the files when
            %% they reach 10Mb or at midnight, whichever comes first, and keep
            %% the last 5 rotations. See the lager README for a description of
            %% the time rotation format:
            %% If you wish to disable rotation, you can either set the size to 0
            %% and the rotation time to "", or instead specify a
2-tuple that only
            %% consists of {Logfile, Level}.
            %% If you wish to have riak log messages to syslog, you
can use a handler
            %% like this:
            %%   {lager_syslog_backend, ["riak", daemon, info]},
            {handlers, [
                           {lager_console_backend, info},
                           {lager_file_backend, [
                               {"./log/error.log", error, 10485760, "$D0", 5},
                               {"./log/console.log", info, 10485760, "$D0", 5}
                       ] },

            %% Whether to write a crash log, and where.
            %% Commented/omitted/undefined means no crash logger.
            {crash_log, "./log/crash.log"},

            %% Maximum size in bytes of events in the crash log -
defaults to 65536
            {crash_log_msg_size, 65536},

            %% Maximum size of the crash log in bytes, before its rotated, set
            %% to 0 to disable rotation - default is 0
            {crash_log_size, 10485760},

            %% What time to rotate the crash log - default is no time
            %% rotation. See the lager README for a description of this format:
            {crash_log_date, "$D0"},

            %% Number of rotated crash logs to keep, 0 means keep only the
            %% current one - default is 0
            {crash_log_count, 5},

            %% Whether to redirect error_logger messages into lager -
defaults to true
            {error_logger_redirect, true}

 %% riak_sysmon config
 {riak_sysmon, [
         %% To disable forwarding events of a particular type, use a
         %% limit of 0.
         {process_limit, 30},
         {port_limit, 2},

         %% Finding reasonable limits for a given workload is a matter
         %% of experimentation.
         %% NOTE: Enabling the 'gc_ms_limit' monitor (by setting non-zero)
         %%       can cause performance problems on multi-CPU systems.
         {gc_ms_limit, 0},
         {heap_word_limit, 40111000},

         %% Configure the following items to 'false' to disable logging
         %% of that event type.
         {busy_port, true},
         {busy_dist_port, true}

 %% SASL config
 {sasl, [
         {sasl_error_logger, false}

 %% riak_control config
 {riak_control, [
                %% Set to false to disable the admin panel.
                {enabled, false},

                %% Authentication style used for access to the admin
                %% panel. Valid styles are 'userlist' <TODO>.
                {auth, userlist},

                %% If auth is set to 'userlist' then this is the
                %% list of usernames and passwords for access to the
                %% admin panel.
                {userlist, [{"user", "pass"}

                %% The admin panel is broken up into multiple
                %% components, each of which is enabled or disabled
                %% by one of these settings.
                {admin, true}

Thanks & Regards,
Naveen Tamanam
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <>

More information about the riak-users mailing list