Track open_shard timeouts with a counter
author    Paul J. Davis <paul.joseph.davis@gmail.com>
          Fri, 11 Nov 2016 17:34:11 +0000 (11:34 -0600)
committer Paul J. Davis <paul.joseph.davis@gmail.com>
          Fri, 11 Nov 2016 17:38:00 +0000 (11:38 -0600)
The open_shard RPC endpoint is used to grab security docs. There are
fairly aggressive timeouts on these requests so that when a node is too
busy the coordinator will try the next shard. Rather than logging every
time these fail (which can produce a substantial amount of log noise
under load), let's just use a counter that can be graphed and alerted on.

COUCHDB-3234
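
As a sketch (not part of this change), the new counter can be read from a
node's remote shell for graphing or alerting, assuming couch_stats also
exposes sample/1 for fetching metric values (node name below is just an
example):

    %% Read the current value of the new counter; it stays at 0 until the
    %% first open_shard timeout is caught.
    (node1@127.0.0.1)1> couch_stats:sample([fabric, open_shard, timeouts]).
    0

    %% The call added in fabric_rpc:open_shard/2 bumps the same key.
    (node1@127.0.0.1)2> couch_stats:increment_counter([fabric, open_shard, timeouts]).
    ok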

priv/stats_descriptions.cfg
src/fabric_rpc.erl

priv/stats_descriptions.cfg
index 0c2351b..d12aa0c 100644
@@ -2,6 +2,10 @@
     {type, counter},
     {desc, <<"number of worker timeouts">>}
 ]}.
+{[fabric, open_shard, timeouts], [
+    {type, counter},
+    {desc, <<"number of open shard timeouts">>}
+]}.
 {[fabric, read_repairs, success], [
     {type, counter},
     {desc, <<"number of successful read repair operations">>}
src/fabric_rpc.erl
index 3a21d83..9c7d518 100644
@@ -253,7 +253,11 @@ reset_validation_funs(DbName) ->
 
 open_shard(Name, Opts) ->
     set_io_priority(Name, Opts),
-    rexi:reply(couch_db:open(Name, Opts)).
+    try
+        rexi:reply(couch_db:open(Name, Opts))
+    catch exit:{timeout, _} ->
+        couch_stats:increment_counter([fabric, open_shard, timeouts])
+    end.
 
 compact(DbName) ->
     with_db(DbName, [], {couch_db, start_compact, []}).