From c92a6e3dade263e44f676290e32c33373942b3ef Mon Sep 17 00:00:00 2001 From: Craig Peterson Date: Tue, 1 Sep 2015 11:09:03 -0600 Subject: [PATCH 1/2] Retrying tsdb queries up to 3 times. --- cmd/bosun/cache/cache.go | 17 ++++++----------- cmd/bosun/expr/funcs.go | 24 +++++++++++++++++------- 2 files changed, 23 insertions(+), 18 deletions(-) diff --git a/cmd/bosun/cache/cache.go b/cmd/bosun/cache/cache.go index 2990707039..2eda5e4068 100644 --- a/cmd/bosun/cache/cache.go +++ b/cmd/bosun/cache/cache.go @@ -14,12 +14,6 @@ type Cache struct { lru *lru.Cache } -// obj is an LRU object tracking data and a corresponding error. -type obj struct { - Val interface{} - Err error -} - func New(MaxEntries int) *Cache { return &Cache{ lru: lru.New(MaxEntries), @@ -34,16 +28,17 @@ func (c *Cache) Get(key string, getFn func() (interface{}, error)) (interface{}, result, ok := c.lru.Get(key) c.Unlock() if ok { - res := result.(*obj) - return res.Val, res.Err + return result, nil } // our lock only serves to protect the lru. // we can (and should!) do singleflight requests concurently return c.g.Do(key, func() (interface{}, error) { v, err := getFn() - c.Lock() - c.lru.Add(key, &obj{v, err}) - c.Unlock() + if err == nil { + c.Lock() + c.lru.Add(key, v) + c.Unlock() + } return v, err }) } diff --git a/cmd/bosun/expr/funcs.go b/cmd/bosun/expr/funcs.go index e95982be98..cc84d29b02 100644 --- a/cmd/bosun/expr/funcs.go +++ b/cmd/bosun/expr/funcs.go @@ -848,6 +848,8 @@ func timeGraphiteRequest(e *State, T miniprofiler.Timer, req *graphite.Request) return } +const tsdbMaxTries = 3 + func timeTSDBRequest(e *State, T miniprofiler.Timer, req *opentsdb.Request) (s opentsdb.ResponseSet, err error) { e.tsdbQueries = append(e.tsdbQueries, *req) if e.autods > 0 { @@ -860,14 +862,22 @@ func timeTSDBRequest(e *State, T miniprofiler.Timer, req *opentsdb.Request) (s o } } b, _ := json.MarshalIndent(req, "", " ") - T.StepCustomTiming("tsdb", "query", string(b), func() { - getFn := func() (interface{}, error) { - return e.tsdbContext.Query(req) + tries := 1 + for { + T.StepCustomTiming("tsdb", "query", string(b), func() { + getFn := func() (interface{}, error) { + return e.tsdbContext.Query(req) + } + var val interface{} + val, err = e.cache.Get(string(b), getFn) + s = val.(opentsdb.ResponseSet).Copy() + + }) + if err == nil || tries == tsdbMaxTries { + break } - var val interface{} - val, err = e.cache.Get(string(b), getFn) - s = val.(opentsdb.ResponseSet).Copy() - }) + tries++ + } return } From 319009d6e079e8555cd314e7ca4d74e761ed194c Mon Sep 17 00:00:00 2001 From: Craig Peterson Date: Tue, 1 Sep 2015 11:11:51 -0600 Subject: [PATCH 2/2] logging --- cmd/bosun/expr/funcs.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cmd/bosun/expr/funcs.go b/cmd/bosun/expr/funcs.go index cc84d29b02..18979caee9 100644 --- a/cmd/bosun/expr/funcs.go +++ b/cmd/bosun/expr/funcs.go @@ -15,6 +15,7 @@ import ( "bosun.org/cmd/bosun/expr/parse" "bosun.org/graphite" "bosun.org/opentsdb" + "bosun.org/slog" ) func logstashTagQuery(args []parse.Node) (parse.Tags, error) { @@ -876,6 +877,7 @@ func timeTSDBRequest(e *State, T miniprofiler.Timer, req *opentsdb.Request) (s o if err == nil || tries == tsdbMaxTries { break } + slog.Errorf("Error on tsdb query %d: %s", tries, err.Error()) tries++ } return