Skip to content
This repository was archived by the owner on Feb 13, 2025. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions cmd/scollector/collectors/collectors.go
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,9 @@ var (
WatchProcessesDotNet = func() {}

KeepalivedCommunity = ""

//TotalScollectorMemoryMB stores the total memory in megabytes used by Scollector (including CGO and WMI)
TotalScollectorMemoryMB uint64
)

func init() {
Expand Down
13 changes: 10 additions & 3 deletions cmd/scollector/collectors/processes_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ func AddProcessConfig(params conf.ProcessParams) error {

var watchedProcs = []*WatchedProc{}

var osPageSize = os.Getpagesize()

// linuxCoreCount counts the number of logical cpus since that is how cpu ticks
// are tracked
func linuxCoreCount() (c int64, err error) {
Expand Down Expand Up @@ -137,16 +139,20 @@ func linuxProcMonitor(w *WatchedProc, md *opentsdb.MultiDataPoint) error {
Add(md, "linux.proc.mem.fault", stats[11], opentsdb.TagSet{"type": "majflt"}.Merge(tags), metadata.Counter, metadata.Fault, descLinuxProcMemFaultMax)
virtual, err := strconv.ParseInt(stats[22], 10, 64)
if err != nil {
return fmt.Errorf("failed to convert process user cpu: %v", err)
return fmt.Errorf("failed to convert process virtual memory: %v", err)
}
totalVirtualMem += virtual
rss, err := strconv.ParseInt(stats[23], 10, 64)
if err != nil {
return fmt.Errorf("failed to convert process system cpu: %v", err)
return fmt.Errorf("failed to convert process rss memory: %v", err)
}
if pid == strconv.Itoa(os.Getpid()) {
TotalScollectorMemoryMB = uint64(rss) * uint64(osPageSize) / 1024 / 1024
}
totalRSSMem += rss
Add(md, "linux.proc.mem.virtual", stats[22], tags, metadata.Gauge, metadata.Bytes, descLinuxProcMemVirtual)
Add(md, "linux.proc.mem.rss", stats[23], tags, metadata.Gauge, metadata.Page, descLinuxProcMemRss)
Add(md, "linux.proc.mem.rss_bytes", rss*int64(osPageSize), tags, metadata.Gauge, metadata.Bytes, descLinuxProcMemRssBytes)
Add(md, "linux.proc.char_io", io[0], opentsdb.TagSet{"type": "read"}.Merge(tags), metadata.Counter, metadata.Bytes, descLinuxProcCharIoRead)
Add(md, "linux.proc.char_io", io[1], opentsdb.TagSet{"type": "write"}.Merge(tags), metadata.Counter, metadata.Bytes, descLinuxProcCharIoWrite)
Add(md, "linux.proc.syscall", io[2], opentsdb.TagSet{"type": "read"}.Merge(tags), metadata.Counter, metadata.Syscall, descLinuxProcSyscallRead)
Expand Down Expand Up @@ -180,7 +186,8 @@ const (
descLinuxProcMemFaultMin = "The number of minor faults the process has made which have not required loading a memory page from disk."
descLinuxProcMemFaultMax = "The number of major faults the process has made which have required loading a memory page from disk."
descLinuxProcMemVirtual = "The virtual memory size."
descLinuxProcMemRss = "The resident set size (number of pages the process has in real memory."
descLinuxProcMemRss = "The resident set size (number of pages the process has in real memory including shared pages)."
descLinuxProcMemRssBytes = "The resident set size (number of bytes the process has in real memory including shared pages)."
descLinuxProcCharIoRead = "The number of bytes which this task has caused to be read from storage. This is simply the sum of bytes which this process passed to read(2) and similar system calls. It includes things such as terminal I/O and is unaffected by whether or not actual physical disk I/O was required (the read might have been satisfied from pagecache)"
descLinuxProcCharIoWrite = "The number of bytes which this task has caused, or shall cause to be written to disk. Similar caveats apply here as with read."
descLinuxProcSyscallRead = "An attempt to count the number of read I/O operations—that is, system calls such as read(2) and pread(2)."
Expand Down
5 changes: 5 additions & 0 deletions cmd/scollector/collectors/processes_windows.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package collectors

import (
"fmt"
"os"
"regexp"
"strings"

Expand Down Expand Up @@ -136,6 +137,10 @@ func c_windows_processes() (opentsdb.MultiDataPoint, error) {
}
}

if v.IDProcess == uint32(os.Getpid()) {
TotalScollectorMemoryMB = v.WorkingSetPrivate / 1024 / 1024
}

if !(service_match || process_match || iis_match) {
continue
}
Expand Down
20 changes: 15 additions & 5 deletions cmd/scollector/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -245,17 +245,27 @@ func main() {
}
collect.MaxQueueLen = conf.MaxQueueLen
}
maxMemMegaBytes := uint64(500)
maxMemMB := uint64(500)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure I love the default of having the kill switch active. May surprise some people.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It has always been active, but only for runtime memory. This adds a second check for total memory usage.

We were using over 1GB on many systems (around 80GB total on all systems) due to a WMI leak, which could have caused other systems to fail if we hadn't noticed.

if conf.MaxMem != 0 {
maxMemMegaBytes = conf.MaxMem
maxMemMB = conf.MaxMem
}
go func() {
maxMemBytes := maxMemMegaBytes * 1024 * 1024
var m runtime.MemStats
for range time.Tick(time.Second * 30) {
runtime.ReadMemStats(&m)
if m.Alloc > maxMemBytes {
panic(fmt.Sprintf("memory max reached: (current: %v bytes, max: %v bytes)", m.Alloc, maxMemBytes))
allocMB := m.Alloc / 1024 / 1024
if allocMB > maxMemMB {
slog.Fatalf("memory max runtime reached: (current alloc: %v megabytes, max: %v megabytes)", allocMB, maxMemMB)
}
//See processes_windows.go and processes_linux.go for total process memory usage.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it true that this will only work if they happen to be monitoring the scollector process? Or maybe just any process monitoring enabled at all?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Have you tried this with the wmi leaky build and seen it trigger at the appropriate time?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I tested by removing all the defer calls in the WMI package. It took 38 minutes for scollector to panic:

image

//Note that in Linux the rss metric includes shared pages, whereas in
//Windows the private working set does not include shared memory.
//Total memory used seems to scale linearly with m.Alloc.
//But we want this to catch a memory leak outside the runtime (WMI/CGO).
//So for now just add any runtime allocations to the allowed total limit.
maxMemTotalMB := maxMemMB + allocMB
if collectors.TotalScollectorMemoryMB > maxMemTotalMB {
slog.Fatalf("memory max total reached: (current total: %v megabytes, current runtime alloc: %v megabytes, max: %v megabytes)", collectors.TotalScollectorMemoryMB, allocMB, maxMemTotalMB)
}
}
}()
Expand Down