diff --git a/cmd/scollector/collectors/collectors.go b/cmd/scollector/collectors/collectors.go index 450e5e180a..ea6f1031df 100644 --- a/cmd/scollector/collectors/collectors.go +++ b/cmd/scollector/collectors/collectors.go @@ -30,6 +30,7 @@ type Collector interface { ApplyTagOverrides(opentsdb.TagSet) } +//These should be in alphabetical order to help prevent merge conflicts const ( osCPU = "os.cpu" osCPUClock = "os.cpu.clock" @@ -41,32 +42,32 @@ const ( osMemPctFree = "os.mem.percent_free" osMemTotal = "os.mem.total" osMemUsed = "os.mem.used" + osNetAdminStatus = "os.net.admin_status" osNetBondBroadcast = "os.net.bond.packets_broadcast" osNetBondBytes = "os.net.bond.bytes" osNetBondDropped = "os.net.bond.dropped" osNetBondErrors = "os.net.bond.errs" + osNetBondIfSpeed = "os.net.bond.ifspeed" osNetBondMulticast = "os.net.bond.packets_multicast" osNetBondPackets = "os.net.bond.packets" osNetBondUnicast = "os.net.bond.packets_unicast" - osNetBondIfSpeed = "os.net.bond.ifspeed" - osNetIfSpeed = "os.net.ifspeed" osNetBroadcast = "os.net.packets_broadcast" osNetBytes = "os.net.bytes" osNetDropped = "os.net.dropped" osNetErrors = "os.net.errs" + osNetIfSpeed = "os.net.ifspeed" + osNetMTU = "os.net.mtu" osNetMulticast = "os.net.packets_multicast" + osNetOperStatus = "os.net.oper_status" osNetPackets = "os.net.packets" osNetPauseFrames = "os.net.pause_frames" osNetUnicast = "os.net.packets_unicast" - osSystemUptime = "os.system.uptime" - osNetMTU = "os.net.mtu" - osNetAdminStatus = "os.net.admin_status" - osNetOperStatus = "os.net.oper_status" - osServiceRunning = "os.service.running" - osProcCPU = "os.proc.cpu" osProcCount = "os.proc.count" + osProcCPU = "os.proc.cpu" osProcMemReal = "os.proc.mem.real" osProcMemVirtual = "os.proc.mem.virtual" + osServiceRunning = "os.service.running" + osSystemUptime = "os.system.uptime" ) const ( @@ -79,24 +80,24 @@ const ( osMemPctFreeDesc = "The percent of free memory. In Linux free memory includes memory used by buffers and cache." 
osMemTotalDesc = "Total amount, in bytes, of physical memory available to the operating system." osMemUsedDesc = "The amount of used memory. In Linux this excludes memory used by buffers and cache." + osNetAdminStatusDesc = "The desired state of the interface. The testing(3) state indicates that no operational packets can be passed. When a managed system initializes, all interfaces start with ifAdminStatus in the down(2) state. As a result of either explicit management action or per configuration information retained by the managed system, ifAdminStatus is then changed to either the up(1) or testing(3) states (or remains in the down(2) state)." osNetBroadcastDesc = "The rate at which broadcast packets are sent or received on the network interface." osNetBytesDesc = "The rate at which bytes are sent or received over the network interface." osNetDroppedDesc = "The number of packets that were chosen to be discarded even though no errors had been detected to prevent transmission." osNetErrorsDesc = "The number of packets that could not be transmitted because of errors." - osNetMulticastDesc = "The rate at which multicast packets are sent or received on the network interface." - osNetPacketsDesc = "The rate at which packets are sent or received on the network interface." - osNetUnicastDesc = "The rate at which unicast packets are sent or received on the network interface." osNetIfSpeedDesc = "The total link speed of the network interface in Megabits per second." - osNetPauseFrameDesc = "The rate of pause frames sent or recieved on the network interface. An overwhelmed network element can send a pause frame, which halts the transmission of the sender for a specified period of time." - osSystemUptimeDesc = "Seconds since last reboot." osNetMTUDesc = "The maximum transmission unit for the ethernet frame." - osNetAdminStatusDesc = "The desired state of the interface. The testing(3) state indicates that no operational packets can be passed. 
When a managed system initializes, all interfaces start with ifAdminStatus in the down(2) state. As a result of either explicit management action or per configuration information retained by the managed system, ifAdminStatus is then changed to either the up(1) or testing(3) states (or remains in the down(2) state)." + osNetMulticastDesc = "The rate at which multicast packets are sent or received on the network interface." osNetOperStatusDesc = "The current operational state of the interface. The testing(3) state indicates that no operational packets can be passed. If ifAdminStatus is down(2) then ifOperStatus should be down(2). If ifAdminStatus is changed to up(1) then ifOperStatus should change to up(1) if the interface is ready to transmit and receive network traffic; it should change to dormant(5) if the interface is waiting for external actions (such as a serial line waiting for an incoming connection); it should remain in the down(2) state if and only if there is a fault that prevents it from going to the up(1) state; it should remain in the notPresent(6) state if the interface has missing (typically, hardware) components." - osServiceRunningDesc = "1: active, 0: inactive" - osProcCPUDesc = "The summed percentage of CPU time used by processes with this name (0-100)." + osNetPacketsDesc = "The rate at which packets are sent or received on the network interface." + osNetPauseFrameDesc = "The rate of pause frames sent or recieved on the network interface. An overwhelmed network element can send a pause frame, which halts the transmission of the sender for a specified period of time." + osNetUnicastDesc = "The rate at which unicast packets are sent or received on the network interface." osProcCountDesc = "The number of processes running with this name." + osProcCPUDesc = "The summed percentage of CPU time used by processes with this name (0-100)." osProcMemRealDesc = "The total amount of real memory used by the processes with this name. 
For Linux this is RSS and in Windows it is the private working set." osProcMemVirtualDesc = "The total amount of virtual memory used by the processes with this name." + osServiceRunningDesc = "1: active, 0: inactive" + osSystemUptimeDesc = "Seconds since last reboot." ) var ( @@ -135,18 +136,53 @@ func now() (t int64) { return } +// matchPattern reports whether s contains any include pattern, i.e. any +// entry of patterns that does not start with "-". +func matchPattern(s string, patterns []string) bool { + for _, p := range patterns { + if !strings.HasPrefix(p, "-") && strings.Contains(s, p) { + return true + } + } + return false +} + +// matchInvertPattern reports whether s contains any exclude pattern, i.e. any +// entry of patterns that starts with "-", compared without that prefix. A +// bare "-" is ignored because its empty term would match every name. +func matchInvertPattern(s string, patterns []string) bool { + for _, p := range patterns { + if len(p) > 1 && p[0] == '-' && strings.Contains(s, p[1:]) { + return true + } + } + return false +} + -// Search returns all collectors matching the pattern s. +// Search returns the collectors selected by the terms in s. A collector is +// kept when its name contains no "-"-prefixed (exclude) term and either s +// contains "*" or the name contains a plain (include) term. An empty s +// selects every collector. func Search(s []string) []Collector { if len(s) == 0 { return collectors } var r []Collector + // Look for the "*" wildcard with a plain scan; sorting s to search it + // would reorder the caller's slice in place. + includeAll := false + for _, p := range s { + if p == "*" { + includeAll = true + break + } + } for _, c := range collectors { - for _, p := range s { - if strings.Contains(c.Name(), p) { - r = append(r, c) - break - } + if matchInvertPattern(c.Name(), s) { + continue + } + if includeAll || matchPattern(c.Name(), s) { + r = append(r, c) } } return r diff --git a/cmd/scollector/collectors/httpunit.go b/cmd/scollector/collectors/httpunit.go index 1101b3439c..5f426520c1 100644 --- a/cmd/scollector/collectors/httpunit.go +++ b/cmd/scollector/collectors/httpunit.go @@ -10,33 +10,31 @@ import ( "github.com/StackExchange/httpunit" ) -func HTTPUnitTOML(filename string) error { +func HTTPUnitTOML(filename string, freq time.Duration) error { var plans httpunit.Plans if _, err := toml.DecodeFile(filename, &plans); err != nil { return err } - HTTPUnitPlans(filename, &plans) + HTTPUnitPlans(filename, &plans, freq) return nil } -func HTTPUnitHiera(filename string) error { +func HTTPUnitHiera(filename string, freq time.Duration) error { plans, err := httpunit.ExtractHiera(filename) if err != nil {
return err } - HTTPUnitPlans(filename, &httpunit.Plans{ - Plans: plans, - }) + HTTPUnitPlans(filename, &httpunit.Plans{Plans: plans}, freq) return nil } -func HTTPUnitPlans(name string, plans *httpunit.Plans) { +func HTTPUnitPlans(name string, plans *httpunit.Plans, freq time.Duration) { collectors = append(collectors, &IntervalCollector{ F: func() (opentsdb.MultiDataPoint, error) { return cHTTPUnit(plans) }, name: fmt.Sprintf("c_httpunit_%s", name), - Interval: time.Minute * 5, + Interval: freq, }) } @@ -53,8 +51,10 @@ func cHTTPUnit(plans *httpunit.Plans) (opentsdb.MultiDataPoint, error) { "url_host": r.Case.URL.Host, "hc_test_case": r.Plan.Label, } + ms := r.Result.TimeTotal / time.Millisecond Add(&md, "hu.error", r.Result.Result != nil, tags, metadata.Gauge, metadata.Bool, descHTTPUnitError) Add(&md, "hu.socket_connected", r.Result.Connected, tags, metadata.Gauge, metadata.Bool, descHTTPUnitSocketConnected) + Add(&md, "hu.time_total", ms, tags, metadata.Gauge, metadata.MilliSecond, descHTTPUnitTotalTime) switch r.Case.URL.Scheme { case "http", "https": Add(&md, "hu.http.got_expected_code", r.Result.GotCode, tags, metadata.Gauge, metadata.Bool, descHTTPUnitExpectedCode) @@ -79,4 +79,5 @@ const ( descHTTPUnitExpectedRegex = "1 if the response matched expected regex, else 0." descHTTPUnitCertValid = "1 if the SSL certificate is valid, else 0." descHTTPUnitCertExpires = "Unix epoch time of the certificate expiration." + descHTTPUnitTotalTime = "Total time consumed by test case." 
) diff --git a/cmd/scollector/conf/conf.go b/cmd/scollector/conf/conf.go index 22d46a2c98..74a6c4b6f7 100644 --- a/cmd/scollector/conf/conf.go +++ b/cmd/scollector/conf/conf.go @@ -157,6 +157,7 @@ type ProcessDotNet struct { type HTTPUnit struct { TOML string Hiera string + Freq string } type Riak struct { diff --git a/cmd/scollector/doc.go b/cmd/scollector/doc.go index 13d440c829..56f7d127d8 100644 --- a/cmd/scollector/doc.go +++ b/cmd/scollector/doc.go @@ -22,8 +22,9 @@ The flags are: -h="" OpenTSDB or Bosun host. Overrides Host in conf file. -f="" - Filters collectors matching these terms, separated by - comma. Overrides Filter in conf file. + Only include collectors matching these comma separated terms. Prefix + with - to invert match and exclude collectors matching those terms. Use + *,-term,-anotherterm to include all collectors except excluded terms. -b=0 OpenTSDB batch size. Default is 500. -conf="" @@ -102,10 +103,12 @@ Default is 500. MaxQueueLen (integer): is the number of metrics keept internally. Default is 200000. -Filter (array of string): filters collectors matching these terms. +Filter (array of string): Only include collectors matching these terms. Prefix +with - to invert match and exclude collectors matching those terms. Use +*,-term,-anotherterm to include all collectors except excluded terms. -MetricFilters (array of string): filters metrics matching these regular -expressions. +MetricFilters (array of string): only send metrics matching these regular +expressions. Example ['^(win\.cpu|win\.system\..*)$', 'free'] IfaceExpr (string): Replaces the default regular expression for interface name matching on Linux. @@ -214,13 +217,14 @@ ProcessDotNet. HTTPUnit (array of table, keys are TOML, Hiera): httpunit TOML and Hiera files to read and monitor. See https://github.com/StackExchange/httpunit for documentation about the toml file. TOML and Hiera may both be specified, -or just one. +or just one. 
Freq is the run interval as a duration string (default 5m, min 1s). [[HTTPUnit]] TOML = "/path/to/httpunit.toml" Hiera = "/path/to/listeners.json" [[HTTPUnit]] TOML = "/some/other.toml" + Freq = "30s" Riak (array of table, keys are URL): Riak hosts to poll. diff --git a/cmd/scollector/main.go b/cmd/scollector/main.go index bd93e08226..8c050c120c 100644 --- a/cmd/scollector/main.go +++ b/cmd/scollector/main.go @@ -100,6 +100,7 @@ func main() { } collectors.Init(conf) for _, r := range conf.MetricFilters { + slog.Infof("Adding MetricFilter: %v\n", r) check(collectors.AddMetricFilters(r)) } for _, rmq := range conf.RabbitMQ { @@ -124,11 +125,24 @@ func main() { check(collectors.AddProcessDotNetConfig(p)) } for _, h := range conf.HTTPUnit { + var freq time.Duration + var parseerr error + if h.Freq == "" { + freq = time.Minute * 5 + } else { + freq, parseerr = time.ParseDuration(h.Freq) + if parseerr != nil { + slog.Fatal(parseerr) + } + if freq < time.Second { + slog.Fatalf("Invalid HTTPUnit frequency %s, cannot be less than 1 second.", h.Freq) + } + } if h.TOML != "" { - check(collectors.HTTPUnitTOML(h.TOML)) + check(collectors.HTTPUnitTOML(h.TOML, freq)) } if h.Hiera != "" { - check(collectors.HTTPUnitHiera(h.Hiera)) + check(collectors.HTTPUnitHiera(h.Hiera, freq)) } } for _, r := range conf.Riak {