Issue with Riak partition allocation

Luke Bakken lbakken at basho.com
Tue Sep 9 10:22:26 EDT 2014


Hi Peter,

Could you please provide the output of "riak-admin transfers"?
--
Luke Bakken
Engineer / CSE
lbakken at basho.com


On Mon, Sep 8, 2014 at 10:01 AM, Peter Bakkum <peter at quizlet.com> wrote:
> Hey all,
>
> Looking for some guidance on a problem we're seeing in production right now.
> We're not Riak experts so please bear with us.
>
> We had a member of our 6-node Riak cluster appear to fall out (riak-admin
> member-status on that node only showed itself). So I ran a riak-admin join
> and riak-admin commit to get the node back in the cluster. Node discovery
> appears to work now, but for some reason that node is now using a huge
> amount of disk space. It appears that the partition balancing process is
> creating this condition, and still hasn't completed after ~16 hours. The
> cluster is still functional and serving our production traffic, and taking
> the entire cluster offline isn't an option for us.
>
> Most of our nodes use about 450GB of space; this node in particular is using
> around 1.2TB, which is pushing the limit of its disk.
>
> Questions:
> What's happening here? Is this expected?
>
> What's the best course of action? Should we clear out this node and attempt
> to join the cluster again?
>
> Here are some stats from the node in question. Let me know if anything else
> would be helpful.
>
> Thanks for your help.
>
>
> [root at 192.168.72.19 /data/lib/riak] # riak-admin member-status
> ================================= Membership ==================================
> Status     Ring    Pending    Node
> -------------------------------------------------------------------------------
> valid      20.3%     16.4%    'xxxx_prod_cluster at 192.168.72.135'
> valid      18.0%     17.2%    'xxxx_prod_cluster at 192.168.72.170'
> valid      20.3%     17.2%    'xxxx_prod_cluster at 192.168.72.176'
> valid       7.0%     16.4%    'xxxx_prod_cluster at 192.168.72.19'
> valid      17.2%     16.4%    'xxxx_prod_cluster at 192.168.72.7'
> valid      17.2%     16.4%    'xxxx_prod_cluster at 192.168.72.74'
>
>
> [root at 192.168.72.19 /data/lib/riak] # riak-admin status
> 1-minute stats for 'xxxx_prod_cluster at 192.168.72.19'
> -------------------------------------------
> riak_kv_stat_ts : 1410194287
> vnode_gets : 1607
> vnode_gets_total : 563683
> vnode_puts : 39
> vnode_puts_total : 5459724
> vnode_index_refreshes : 0
> vnode_index_refreshes_total : 0
> vnode_index_reads : 0
> vnode_index_reads_total : 0
> vnode_index_writes : 39
> vnode_index_writes_total : 5459724
> vnode_index_writes_postings : 0
> vnode_index_writes_postings_total : 5227558
> vnode_index_deletes : 0
> vnode_index_deletes_total : 0
> vnode_index_deletes_postings : 39
> vnode_index_deletes_postings_total : 30613
> node_gets : 3602
> node_gets_total : 2463956
> node_get_fsm_siblings_mean : 1
> node_get_fsm_siblings_median : 1
> node_get_fsm_siblings_95 : 2
> node_get_fsm_siblings_99 : 3
> node_get_fsm_siblings_100 : 12
> node_get_fsm_objsize_mean : 52047
> node_get_fsm_objsize_median : 26936
> node_get_fsm_objsize_95 : 167435
> node_get_fsm_objsize_99 : 267979
> node_get_fsm_objsize_100 : 1313716
> node_get_fsm_time_mean : 12223
> node_get_fsm_time_median : 6675
> node_get_fsm_time_95 : 37390
> node_get_fsm_time_99 : 87046
> node_get_fsm_time_100 : 345380
> node_puts : 39
> node_puts_total : 24915
> node_put_fsm_time_mean : 4419
> node_put_fsm_time_median : 2444
> node_put_fsm_time_95 : 12890
> node_put_fsm_time_99 : 18775
> node_put_fsm_time_100 : 18775
> read_repairs : 0
> read_repairs_total : 0
> coord_redirs_total : 17022
> executing_mappers : 0
> precommit_fail : 0
> postcommit_fail : 0
> index_fsm_create : 0
> index_fsm_create_error : 0
> index_fsm_active : 0
> list_fsm_create : 0
> list_fsm_create_error : 0
> list_fsm_active : 0
> pbc_active : 0
> pbc_connects : 1
> pbc_connects_total : 508
> node_get_fsm_active : 1
> node_get_fsm_active_60s : 3530
> node_get_fsm_in_rate : 55
> node_get_fsm_out_rate : 56
> node_get_fsm_rejected : 0
> node_get_fsm_rejected_60s : 0
> node_get_fsm_rejected_total : 0
> node_put_fsm_active : 0
> node_put_fsm_active_60s : 67
> node_put_fsm_in_rate : 1
> node_put_fsm_out_rate : 1
> node_put_fsm_rejected : 0
> node_put_fsm_rejected_60s : 0
> node_put_fsm_rejected_total : 0
> leveldb_read_block_error : 0
> riak_pipe_stat_ts : 1410194286
> pipeline_active : 0
> pipeline_create_count : 0
> pipeline_create_one : 0
> pipeline_create_error_count : 0
> pipeline_create_error_one : 0
> cpu_nprocs : 426
> cpu_avg1 : 1352
> cpu_avg5 : 1260
> cpu_avg15 : 1137
> mem_total : 15666507776
> mem_allocated : 15479640064
> disk : [{"/",8256952,60},
>         {"/dev/shm",7649660,0},
>         {"/tmpfs",1048576,14},
>         {"/tmpfs_mp3",1048576,0},
>         {"/data",1514123712,81}]
> nodename : 'xxxx_prod_cluster at 192.168.72.19'
> connected_nodes : ['xxxx_prod_cluster at 192.168.72.170',
>                    'xxxx_prod_cluster at 192.168.72.176',
>                    'xxxx_prod_cluster at 192.168.72.74',
>                    'xxxx_prod_cluster at 192.168.72.135',
>                    'xxxx_prod_cluster at 192.168.72.7']
> sys_driver_version : <<"2.0">>
> sys_global_heaps_size : 0
> sys_heap_type : private
> sys_logical_processors : 4
> sys_otp_release : <<"R15B01">>
> sys_process_count : 2469
> sys_smp_support : true
> sys_system_version : <<"Erlang R15B01 (erts-5.9.1) [source] [64-bit] [smp:4:4] [async-threads:64] [kernel-poll:true]">>
> sys_system_architecture : <<"x86_64-unknown-linux-gnu">>
> sys_threads_enabled : true
> sys_thread_pool_size : 64
> sys_wordsize : 8
> ring_members : ['xxxx_prod_cluster at 192.168.72.135',
>                 'xxxx_prod_cluster at 192.168.72.170',
>                 'xxxx_prod_cluster at 192.168.72.176',
>                 'xxxx_prod_cluster at 192.168.72.19',
>                 'xxxx_prod_cluster at 192.168.72.7',
>                 'xxxx_prod_cluster at 192.168.72.74']
> ring_num_partitions : 128
> ring_ownership : <<"[{'xxxx_prod_cluster at 192.168.72.170',23},\n
> {'xxxx_prod_cluster at 192.168.72.74',22},\n
> {'xxxx_prod_cluster at 192.168.72.135',26},\n
> {'xxxx_prod_cluster at 192.168.72.176',26},\n
> {'xxxx_prod_cluster at 192.168.72.7',22},\n
> {'xxxx_prod_cluster at 192.168.72.19',9}]">>
> ring_creation_size : 128
> storage_backend : riak_kv_eleveldb_backend
> erlydtl_version : <<"0.7.0">>
> riak_control_version : <<"1.4.10-0-g73c43c3">>
> cluster_info_version : <<"1.2.4">>
> riak_search_version : <<"1.4.10-0-g6e548e7">>
> merge_index_version : <<"1.3.2-0-gcb38ee7">>
> riak_kv_version : <<"1.4.10-0-g64b6ad8">>
> sidejob_version : <<"0.2.0">>
> riak_api_version : <<"1.4.10-0-gc407ac0">>
> riak_pipe_version : <<"1.4.10-0-g9353526">>
> riak_core_version : <<"1.4.10">>
> bitcask_version : <<"1.6.6-0-g230b6d6">>
> basho_stats_version : <<"1.0.3">>
> webmachine_version : <<"1.10.4-0-gfcff795">>
> mochiweb_version : <<"1.5.1p6">>
> inets_version : <<"5.9">>
> erlang_js_version : <<"1.2.2">>
> runtime_tools_version : <<"1.8.8">>
> os_mon_version : <<"2.2.9">>
> riak_sysmon_version : <<"1.1.3">>
> ssl_version : <<"5.0.1">>
> public_key_version : <<"0.15">>
> crypto_version : <<"2.1">>
> sasl_version : <<"2.2.1">>
> lager_version : <<"2.0.1">>
> goldrush_version : <<"0.1.5">>
> compiler_version : <<"4.8.1">>
> syntax_tools_version : <<"1.6.8">>
> stdlib_version : <<"1.18.1">>
> kernel_version : <<"2.15.1">>
> memory_total : 130705264
> memory_processes : 55557705
> memory_processes_used : 55341757
> memory_system : 75147559
> memory_atom : 545377
> memory_atom_used : 527226
> memory_binary : 12172712
> memory_code : 11674242
> memory_ets : 11913912
>
>
>
> _______________________________________________
> riak-users mailing list
> riak-users at lists.basho.com
> http://lists.basho.com/mailman/listinfo/riak-users_lists.basho.com
>




More information about the riak-users mailing list