From aad1661a2d6aaf1d39061e1f5f640b5b0c9a915f Mon Sep 17 00:00:00 2001 From: Greg Bray Date: Fri, 12 Aug 2016 13:35:54 -0600 Subject: [PATCH] cmd/scollector: add a kill switch for total memory usage to catch any leaks outside the runtime --- cmd/scollector/collectors/collectors.go | 3 +++ cmd/scollector/collectors/processes_linux.go | 13 +++++++++--- .../collectors/processes_windows.go | 5 +++++ cmd/scollector/main.go | 20 ++++++++++++++----- 4 files changed, 33 insertions(+), 8 deletions(-) diff --git a/cmd/scollector/collectors/collectors.go b/cmd/scollector/collectors/collectors.go index 576505a199..afbe82c99d 100644 --- a/cmd/scollector/collectors/collectors.go +++ b/cmd/scollector/collectors/collectors.go @@ -120,6 +120,9 @@ var ( WatchProcessesDotNet = func() {} KeepalivedCommunity = "" + + //TotalScollectorMemoryMB stores the total memory used by Scollector in megabytes (including CGO and WMI) + TotalScollectorMemoryMB uint64 ) func init() { diff --git a/cmd/scollector/collectors/processes_linux.go b/cmd/scollector/collectors/processes_linux.go index 263ad7b662..65243a45ed 100644 --- a/cmd/scollector/collectors/processes_linux.go +++ b/cmd/scollector/collectors/processes_linux.go @@ -25,6 +25,8 @@ func AddProcessConfig(params conf.ProcessParams) error { var watchedProcs = []*WatchedProc{} +var osPageSize = os.Getpagesize() + // linuxCoreCount counts the number of logical cpus since that is how cpu ticks // are tracked func linuxCoreCount() (c int64, err error) { @@ -137,16 +139,20 @@ func linuxProcMonitor(w *WatchedProc, md *opentsdb.MultiDataPoint) error { Add(md, "linux.proc.mem.fault", stats[11], opentsdb.TagSet{"type": "majflt"}.Merge(tags), metadata.Counter, metadata.Fault, descLinuxProcMemFaultMax) virtual, err := strconv.ParseInt(stats[22], 10, 64) if err != nil { - return fmt.Errorf("failed to convert process user cpu: %v", err) + return fmt.Errorf("failed to convert process virtual memory: %v", err) } totalVirtualMem += virtual rss, err := 
strconv.ParseInt(stats[23], 10, 64) if err != nil { - return fmt.Errorf("failed to convert process system cpu: %v", err) + return fmt.Errorf("failed to convert process rss memory: %v", err) + } + if pid == strconv.Itoa(os.Getpid()) { + TotalScollectorMemoryMB = uint64(rss) * uint64(osPageSize) / 1024 / 1024 } totalRSSMem += rss Add(md, "linux.proc.mem.virtual", stats[22], tags, metadata.Gauge, metadata.Bytes, descLinuxProcMemVirtual) Add(md, "linux.proc.mem.rss", stats[23], tags, metadata.Gauge, metadata.Page, descLinuxProcMemRss) + Add(md, "linux.proc.mem.rss_bytes", rss*int64(osPageSize), tags, metadata.Gauge, metadata.Bytes, descLinuxProcMemRssBytes) Add(md, "linux.proc.char_io", io[0], opentsdb.TagSet{"type": "read"}.Merge(tags), metadata.Counter, metadata.Bytes, descLinuxProcCharIoRead) Add(md, "linux.proc.char_io", io[1], opentsdb.TagSet{"type": "write"}.Merge(tags), metadata.Counter, metadata.Bytes, descLinuxProcCharIoWrite) Add(md, "linux.proc.syscall", io[2], opentsdb.TagSet{"type": "read"}.Merge(tags), metadata.Counter, metadata.Syscall, descLinuxProcSyscallRead) @@ -180,7 +186,8 @@ const ( descLinuxProcMemFaultMin = "The number of minor faults the process has made which have not required loading a memory page from disk." descLinuxProcMemFaultMax = "The number of major faults the process has made which have required loading a memory page from disk." descLinuxProcMemVirtual = "The virtual memory size." - descLinuxProcMemRss = "The resident set size (number of pages the process has in real memory." + descLinuxProcMemRss = "The resident set size (number of pages the process has in real memory including shared pages)." + descLinuxProcMemRssBytes = "The resident set size (number of bytes the process has in real memory including shared pages)." descLinuxProcCharIoRead = "The number of bytes which this task has caused to be read from storage. This is simply the sum of bytes which this process passed to read(2) and similar system calls. 
It includes things such as terminal I/O and is unaffected by whether or not actual physical disk I/O was required (the read might have been satisfied from pagecache)" descLinuxProcCharIoWrite = "The number of bytes which this task has caused, or shall cause to be written to disk. Similar caveats apply here as with read." descLinuxProcSyscallRead = "An attempt to count the number of read I/O operations—that is, system calls such as read(2) and pread(2)." diff --git a/cmd/scollector/collectors/processes_windows.go b/cmd/scollector/collectors/processes_windows.go index 4e3e4c93e1..aeb06f9674 100644 --- a/cmd/scollector/collectors/processes_windows.go +++ b/cmd/scollector/collectors/processes_windows.go @@ -2,6 +2,7 @@ package collectors import ( "fmt" + "os" "regexp" "strings" @@ -136,6 +137,10 @@ func c_windows_processes() (opentsdb.MultiDataPoint, error) { } } + if v.IDProcess == uint32(os.Getpid()) { + TotalScollectorMemoryMB = v.WorkingSetPrivate / 1024 / 1024 + } + if !(service_match || process_match || iis_match) { continue } diff --git a/cmd/scollector/main.go b/cmd/scollector/main.go index 5d841db8d6..7dee5e742d 100644 --- a/cmd/scollector/main.go +++ b/cmd/scollector/main.go @@ -245,17 +245,27 @@ func main() { } collect.MaxQueueLen = conf.MaxQueueLen } - maxMemMegaBytes := uint64(500) + maxMemMB := uint64(500) if conf.MaxMem != 0 { - maxMemMegaBytes = conf.MaxMem + maxMemMB = conf.MaxMem } go func() { - maxMemBytes := maxMemMegaBytes * 1024 * 1024 var m runtime.MemStats for range time.Tick(time.Second * 30) { runtime.ReadMemStats(&m) - if m.Alloc > maxMemBytes { - panic(fmt.Sprintf("memory max reached: (current: %v bytes, max: %v bytes)", m.Alloc, maxMemBytes)) + allocMB := m.Alloc / 1024 / 1024 + if allocMB > maxMemMB { + slog.Fatalf("memory max runtime reached: (current alloc: %v megabytes, max: %v megabytes)", allocMB, maxMemMB) + } + //See processes_windows.go and processes_linux.go for total process memory usage. 
+ //Note that in linux the rss metric includes shared pages, whereas in + //Windows the private working set does not include shared memory. + //Total memory used seems to scale linearly with m.Alloc. + //But we want this to catch a memory leak outside the runtime (WMI/CGO). + //So for now just add any runtime allocations to the allowed total limit. + maxMemTotalMB := maxMemMB + allocMB + if collectors.TotalScollectorMemoryMB > maxMemTotalMB { + slog.Fatalf("memory max total reached: (current total: %v megabytes, current runtime alloc: %v megabytes, max: %v megabytes)", collectors.TotalScollectorMemoryMB, allocMB, maxMemTotalMB) } } }()