This repository was archived by the owner on Feb 13, 2025. It is now read-only.
Merged
14 changes: 12 additions & 2 deletions cmd/bosun/sched/bolt.go
@@ -51,6 +51,7 @@ const (
dbMetadata = "metadata"
dbMetricMetadata = "metadata-metric"
dbIncidents = "incidents"
dbErrors = "errors"
)

func (s *Schedule) save() {
@@ -67,8 +68,9 @@ func (s *Schedule) save() {
dbSilence: s.Silence,
dbStatus: s.status,
dbMetadata: s.Metadata,
dbIncidents: s.Incidents,
dbMetricMetadata: s.metricMetadata,
dbIncidents: s.Incidents,
dbErrors: s.AlertStatuses,
}
tostore := make(map[string][]byte)
for name, data := range store {
@@ -181,6 +183,9 @@ func (s *Schedule) RestoreState() error {
if err := decode(dbIncidents, &s.Incidents); err != nil {
slog.Errorln(dbIncidents, err)
}
if err := decode(dbErrors, &s.AlertStatuses); err != nil {
slog.Errorln(dbErrors, err)
}

// Calculate next incident id.
for _, i := range s.Incidents {
@@ -216,11 +221,16 @@ func (s *Schedule) RestoreState() error {
}
}
clear(st.Result)
newHistory := []Event{}
for _, e := range st.History {
clear(e.Warn)
clear(e.Crit)
clear(e.Error)
// Drop error events; the StError status no longer exists.
if e.Status <= StUnknown {
newHistory = append(newHistory, e)
}
}
st.History = newHistory
s.status[ak] = st
if a.Log && st.Open {
st.Open = false
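The bolt.go changes above add an "errors" bucket so the new AlertStatuses map survives restarts, and RestoreState now drops any persisted event whose status is no longer defined (the removed StError). A minimal standalone sketch of that pruning step, using hypothetical types and a hypothetical status ordering rather than bosun's own:

package main

import "fmt"

// Hypothetical ordering: StUnknown is the last defined status; the old
// StError value sorted after it, so anything above StUnknown is stale.
type Status int

const (
	StNormal Status = iota
	StWarning
	StCritical
	StUnknown
)

type Event struct {
	Status Status
}

// pruneHistory mirrors the `e.Status <= StUnknown` filter in RestoreState.
func pruneHistory(history []Event) []Event {
	kept := []Event{}
	for _, e := range history {
		if e.Status <= StUnknown {
			kept = append(kept, e)
		}
	}
	return kept
}

func main() {
	h := []Event{{StNormal}, {StCritical}, {Status(4)}} // Status(4): an old error event read from disk
	fmt.Println(len(pruneHistory(h)))                   // prints 2
}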
30 changes: 7 additions & 23 deletions cmd/bosun/sched/check.go
@@ -129,9 +129,7 @@ func (s *Schedule) runHistory(r *RunHistory, ak expr.AlertKey, event *Event, sil
// make sure we always touch the state.
state.Touched = r.Start
// set state.Result according to event result
if event.Error != nil {
state.Result = event.Error
} else if event.Crit != nil {
if event.Crit != nil {
state.Result = event.Crit
} else if event.Warn != nil {
state.Result = event.Warn
@@ -407,7 +405,7 @@ func (s *Schedule) findUnknownAlerts(now time.Time, alert string) []expr.AlertKe
s.Lock("FindUnknown")
for ak, st := range s.status {
name := ak.Name()
if name != alert || st.Forgotten || st.Status() == StError {
if name != alert || st.Forgotten || !s.AlertSuccessful(ak.Name()) {
continue
}
a := s.Conf.Alerts[name]
@@ -437,13 +435,16 @@ func (s *Schedule) CheckAlert(T miniprofiler.Timer, r *RunHistory, a *conf.Alert
deps = filterDependencyResults(d)
crits, err = s.CheckExpr(T, r, a, a.Crit, StCritical, nil)
if err == nil {
warns, _ = s.CheckExpr(T, r, a, a.Warn, StWarning, crits)
warns, err = s.CheckExpr(T, r, a, a.Warn, StWarning, crits)
}
}
unevalCount, unknownCount := markDependenciesUnevaluated(r.Events, deps, a.Name)
if err != nil {
slog.Errorf("Error checking alert %s: %s", a.Name, err.Error())
removeUnknownEvents(r.Events, a.Name)
s.markAlertError(a.Name, err)
} else {
s.markAlertSuccessful(a.Name)
}
collect.Put("check.duration", opentsdb.TagSet{"name": a.Name}, time.Since(start).Seconds())
slog.Infof("check alert %v done (%s): %v crits, %v warns, %v unevaluated, %v unknown", a.Name, time.Since(start), len(crits), len(warns), unevalCount, unknownCount)
@@ -502,23 +503,6 @@ func (s *Schedule) executeExpr(T miniprofiler.Timer, rh *RunHistory, a *conf.Ale
return nil, nil
}
results, _, err := e.Execute(rh.Context, rh.GraphiteContext, rh.Logstash, rh.InfluxHost, rh.Cache, T, rh.Start, 0, a.UnjoinedOK, s.Search, s.Conf.AlertSquelched(a), rh)
if err != nil {
ak := expr.NewAlertKey(a.Name, nil)
rh.Events[ak] = &Event{
Status: StError,
Error: &Result{
Result: &expr.Result{
Computations: []expr.Computation{
{
Text: e.String(),
Value: err.Error(),
},
},
},
},
}
return nil, err
}
return results, err
}

@@ -575,7 +559,7 @@ Loop:
}
status := checkStatus
if math.IsNaN(n) {
status = StError
status = checkStatus
} else if n == 0 {
status = StNormal
}
2 changes: 0 additions & 2 deletions cmd/bosun/sched/filter.go
@@ -81,8 +81,6 @@ func makeFilter(filter string) (func(*conf.Conf, *conf.Alert, *State) bool, erro
v = StWarning
case "critical":
v = StCritical
case "error":
v = StError
case "unknown":
v = StUnknown
default:
160 changes: 142 additions & 18 deletions cmd/bosun/sched/sched.go
@@ -49,6 +49,8 @@ type Schedule struct {
Incidents map[uint64]*Incident
Search *search.Search

AlertStatuses map[string]*AlertStatus

//channel signals an alert has added notifications, and notifications should be processed.
nc chan interface{}
//notifications to be sent immediately
Expand All @@ -58,11 +60,12 @@ type Schedule struct {
//unknown states that need to be notified about. Collected and sent in batches.
pendingUnknowns map[*conf.Notification][]*State

metaLock sync.Mutex
metricMetaLock sync.Mutex
maxIncidentId uint64
incidentLock sync.Mutex
db *bolt.DB
metaLock sync.Mutex
metricMetaLock sync.Mutex
alertStatusLock sync.Mutex
maxIncidentId uint64
incidentLock sync.Mutex
db *bolt.DB

LastCheck time.Time

@@ -72,6 +75,7 @@
func (s *Schedule) Init(c *conf.Conf) error {
var err error
s.Conf = c
s.AlertStatuses = make(map[string]*AlertStatus)
s.Silence = make(map[string]*Silence)
s.Group = make(map[time.Time]expr.AlertKeys)
s.Metadata = make(map[metadata.Metakey]*Metavalue)
@@ -353,6 +357,7 @@ type StateGroup struct {
Active bool `json:",omitempty"`
Status Status
Silenced bool
IsError bool `json:",omitempty"`
Subject string `json:",omitempty"`
Alert string `json:",omitempty"`
AlertKey expr.AlertKey `json:",omitempty"`
@@ -366,7 +371,8 @@ type StateGroups struct {
NeedAck []*StateGroup `json:",omitempty"`
Acknowledged []*StateGroup `json:",omitempty"`
}
TimeAndDate []int
TimeAndDate []int
FailingAlerts, UnclosedErrors int
}

func (s *Schedule) MarshalGroups(T miniprofiler.Timer, filter string) (*StateGroups, error) {
@@ -380,6 +386,7 @@ func (s *Schedule) MarshalGroups(T miniprofiler.Timer, filter string) (*StateGro
t := StateGroups{
TimeAndDate: s.Conf.TimeAndDate,
}
t.FailingAlerts, t.UnclosedErrors = s.getErrorCounts()
s.Lock("MarshallGroups")
defer s.Unlock()
T.Step("Setup", func(miniprofiler.Timer) {
@@ -414,7 +421,7 @@ func (s *Schedule) MarshalGroups(T miniprofiler.Timer, filter string) (*StateGro
for tuple, states := range groups {
var grouped []*StateGroup
switch tuple.Status {
case StWarning, StCritical, StUnknown, StError:
case StWarning, StCritical, StUnknown:
var sets map[string]expr.AlertKeys
T.Step(fmt.Sprintf("GroupSets (%d): %v", len(states), tuple), func(T miniprofiler.Timer) {
sets = states.GroupSets()
@@ -447,6 +454,7 @@ func (s *Schedule) MarshalGroups(T miniprofiler.Timer, filter string) (*StateGro
Subject: string(st.Subject),
Ago: marshalTime(st.Last().Time),
State: st,
IsError: !s.AlertSuccessful(ak.Name()),
})
}
if len(g.Children) == 1 && g.Children[0].Subject != "" {
@@ -683,7 +691,6 @@ func (s *Schedule) Action(user, message string, t ActionType, ak expr.AlertKey)
st.NeedAck = false
}
isUnknown := st.AbnormalStatus() == StUnknown
isError := st.AbnormalStatus() == StError
timestamp := time.Now().UTC()
switch t {
case ActionAcknowledge:
@@ -698,7 +705,7 @@ func (s *Schedule) Action(user, message string, t ActionType, ak expr.AlertKey)
if st.NeedAck {
ack()
}
if st.IsActive() && !isError {
if st.IsActive() {
return fmt.Errorf("cannot close active alert")
}
st.Open = false
@@ -755,11 +762,11 @@ func (s *State) Last() Event {
}

type Event struct {
Warn, Crit, Error *Result
Status Status
Time time.Time
Unevaluated bool
IncidentId uint64
Warn, Crit *Result
Status Status
Time time.Time
Unevaluated bool
IncidentId uint64
}

type Result struct {
@@ -779,7 +786,6 @@ const (
StWarning
StCritical
StUnknown
StError
)

func (s Status) String() string {
@@ -792,8 +798,6 @@ func (s Status) String() string {
return "critical"
case StUnknown:
return "unknown"
case StError:
return "error"
default:
return "none"
}
@@ -807,7 +811,6 @@ func (s Status) IsNormal() bool { return s == StNormal }
func (s Status) IsWarning() bool { return s == StWarning }
func (s Status) IsCritical() bool { return s == StCritical }
func (s Status) IsUnknown() bool { return s == StUnknown }
func (s Status) IsError() bool { return s == StError }

type Action struct {
User string
@@ -1123,3 +1126,124 @@ type HostData struct {
}
SerialNumber string `json:",omitempty"`
}

// AlertStatus is the current error state of a single alert.
type AlertStatus struct {
Success bool
Errors []*AlertError
}

type AlertError struct {
FirstTime, LastTime time.Time
Count int
Message string
}

func (s *Schedule) AlertSuccessful(name string) bool {
s.alertStatusLock.Lock()
defer s.alertStatusLock.Unlock()
if as, ok := s.AlertStatuses[name]; ok {
return as.Success
}
return true
}

func (s *Schedule) markAlertError(name string, err error) {
s.alertStatusLock.Lock()
defer s.alertStatusLock.Unlock()
as, ok := s.AlertStatuses[name]
if !ok {
as = &AlertStatus{}
s.AlertStatuses[name] = as
}
// if it succeeded prior to now, make a new error event.
// else if message is same as last, coalesce together.
// else append new event
now := time.Now().UTC().Truncate(time.Second)
newError := func() {
as.Errors = append(as.Errors, &AlertError{
FirstTime: now,
LastTime: now,
Count: 1,
Message: err.Error(),
})
}
if as.Success || len(as.Errors) == 0 {
newError()
} else {
last := as.Errors[len(as.Errors)-1]
if err.Error() == last.Message {
last.Count++
last.LastTime = now
} else {
newError()
}
}
as.Success = false
}

func (s *Schedule) markAlertSuccessful(name string) {
s.alertStatusLock.Lock()
defer s.alertStatusLock.Unlock()
as, ok := s.AlertStatuses[name]
if !ok {
as = &AlertStatus{}
s.AlertStatuses[name] = as
}
as.Success = true
}

func (s *Schedule) ClearErrorLine(alert string, startTime time.Time) {
s.alertStatusLock.Lock()
defer s.alertStatusLock.Unlock()
if as, ok := s.AlertStatuses[alert]; ok {
newErrors := make([]*AlertError, 0, len(as.Errors))
for _, err := range as.Errors {
if err.FirstTime != startTime {
newErrors = append(newErrors, err)
}
}
as.Errors = newErrors
if len(as.Errors) == 0 {
as.Success = true
}
}
}

func (s *Schedule) getErrorCounts() (failing, total int) {
failing = 0
total = 0
s.alertStatusLock.Lock()
defer s.alertStatusLock.Unlock()
for _, as := range s.AlertStatuses {
if !as.Success {
failing++
}
for _, err := range as.Errors {
total += err.Count
}
}
return
}

func (s *Schedule) GetErrorHistory() map[string]*AlertStatus {
s.alertStatusLock.Lock()
defer s.alertStatusLock.Unlock()
mapCopy := make(map[string]*AlertStatus, len(s.AlertStatuses))
for name, as := range s.AlertStatuses {
asCopy := &AlertStatus{
Success: as.Success,
Errors: make([]*AlertError, len(as.Errors)),
}
for i, err := range as.Errors {
asCopy.Errors[i] = &AlertError{
Count: err.Count,
FirstTime: err.FirstTime.UTC(),
LastTime: err.LastTime.UTC(),
Message: err.Message,
}
}
mapCopy[name] = asCopy
}
return mapCopy
}
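The AlertStatus bookkeeping above replaces the removed StError status: consecutive failures of an alert with the same message are coalesced into one AlertError whose Count and LastTime advance, while a changed message or an intervening successful run starts a new entry. A standalone sketch of that coalescing behavior, with the logic lifted out of Schedule and without the locking (hypothetical helper, not bosun API):

package main

import (
	"errors"
	"fmt"
	"time"
)

type AlertError struct {
	FirstTime, LastTime time.Time
	Count               int
	Message             string
}

type AlertStatus struct {
	Success bool
	Errors  []*AlertError
}

// markError mirrors markAlertError above: repeat errors with the same
// message are coalesced; anything else appends a new AlertError.
func markError(as *AlertStatus, err error) {
	now := time.Now().UTC().Truncate(time.Second)
	newEntry := func() {
		as.Errors = append(as.Errors, &AlertError{
			FirstTime: now,
			LastTime:  now,
			Count:     1,
			Message:   err.Error(),
		})
	}
	if as.Success || len(as.Errors) == 0 {
		newEntry()
	} else if last := as.Errors[len(as.Errors)-1]; err.Error() == last.Message {
		last.Count++
		last.LastTime = now
	} else {
		newEntry()
	}
	as.Success = false
}

func main() {
	as := &AlertStatus{Success: true}
	markError(as, errors.New("tsdb unreachable"))
	markError(as, errors.New("tsdb unreachable")) // coalesced into the first entry
	markError(as, errors.New("bad query"))        // different message: new entry
	for _, e := range as.Errors {
		fmt.Printf("%dx %s\n", e.Count, e.Message)
	}
	// Output:
	// 2x tsdb unreachable
	// 1x bad query
}

getErrorCounts and GetErrorHistory then only read this structure (under alertStatusLock) to surface failing-alert and unclosed-error counts to the UI.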