ganglia/monitor-core

gmond / sflow / segmentation fault / malformed XML

limbo127 opened this issue · 7 comments

Hello,
I configured gmond to receive sFlow data from a host sFlow agent.
After that, a simple telnet 127.0.0.1 8649 kills gmond with a segfault.

[root@retd ~]# gmond --version
gmond 3.7.2

[root@retd ~]# gmond -f -d 2 -c /etc/ganglia/gmond.conf
loaded module: core_metrics
loaded module: cpu_module
loaded module: disk_module
loaded module: load_module
loaded module: mem_module
loaded module: net_module
loaded module: proc_module
loaded module: sys_module
loaded module: multicpu_module
udp_recv_channel mcast_join=NULL mcast_if=NULL port=6344 bind=NULL buffer=0
socket created, SO_RCVBUF = 212992

tcp_accept_channel bind=NULL port=8649 gzip_output=0
[tcp] Starting TCP listener thread...
spoofName: Nico-hard spoofIP: 10.10.3.230

first time - allocate SFlowAgent
create subAgent 10.10.3.230:100000
create datasource 10.10.3.230:100000-2:1
sequence number error - 10.10.3.230:100000-2:1 lostSamples=0
**Allocating metadata packet for host--Nico-hard-- and metric --heartbeat-- ***

saving metadata for metric: heartbeat host: Nico-hard
**Allocating value packet for host--(null)-- and metric --heartbeat-- ***

Got a heartbeat message 1456301617

**Allocating metadata packet for host--Nico-hard-- and metric --os_release-- ***

saving metadata for metric: os_release host: Nico-hard
**Allocating value packet for host--(null)-- and metric --os_release-- ***

**Allocating metadata packet for host--Nico-hard-- and metric --uuid-- ***
......
saving metadata for metric: win10.vdrops_out host: Nico-hard
[tcp] Request for XML data received.
Segmentation fault


telnet 127.0.0.1 8649
Trying 127.0.0.1...
Connected to 127.0.0.1.
Escape character is '^]'.

  <!ATTLIST GANGLIA_XML VERSION CDATA #REQUIRED>
  <!ATTLIST GANGLIA_XML SOURCE CDATA #REQUIRED>
  <!ATTLIST GRID NAME CDATA #REQUIRED>
  <!ATTLIST GRID AUTHORITY CDATA #REQUIRED>
  <!ATTLIST GRID LOCALTIME CDATA #IMPLIED>
  <!ATTLIST CLUSTER NAME CDATA #REQUIRED>
  <!ATTLIST CLUSTER OWNER CDATA #IMPLIED>
  <!ATTLIST CLUSTER LATLONG CDATA #IMPLIED>
  <!ATTLIST CLUSTER URL CDATA #IMPLIED>
  <!ATTLIST CLUSTER LOCALTIME CDATA #REQUIRED>
  <!ATTLIST HOST NAME CDATA #REQUIRED>
  <!ATTLIST HOST IP CDATA #REQUIRED>
  <!ATTLIST HOST LOCATION CDATA #IMPLIED>
  <!ATTLIST HOST TAGS CDATA #IMPLIED>
  <!ATTLIST HOST REPORTED CDATA #REQUIRED>
  <!ATTLIST HOST TN CDATA #IMPLIED>
  <!ATTLIST HOST TMAX CDATA #IMPLIED>
  <!ATTLIST HOST DMAX CDATA #IMPLIED>
  <!ATTLIST HOST GMOND_STARTED CDATA #IMPLIED>
  <!ATTLIST METRIC NAME CDATA #REQUIRED>
  <!ATTLIST METRIC VAL CDATA #REQUIRED>
  <!ATTLIST METRIC TYPE (string | int8 | uint8 | int16 | uint16 | int32 | uint32 | float | double | timestamp) #REQUIRED>
  <!ATTLIST METRIC UNITS CDATA #IMPLIED>
  <!ATTLIST METRIC TN CDATA #IMPLIED>
  <!ATTLIST METRIC TMAX CDATA #IMPLIED>
  <!ATTLIST METRIC DMAX CDATA #IMPLIED>
  <!ATTLIST METRIC SLOPE (zero | positive | negative | both | unspecified) #IMPLIED>
  <!ATTLIST METRIC SOURCE (gmond) 'gmond'>
  <!ATTLIST EXTRA_ELEMENT NAME CDATA #REQUIRED>
  <!ATTLIST EXTRA_ELEMENT VAL CDATA #REQUIRED>
  <!ATTLIST HOSTS UP CDATA #REQUIRED>
  <!ATTLIST HOSTS DOWN CDATA #REQUIRED>
  <!ATTLIST HOSTS SOURCE (gmond | gmetad) #REQUIRED>
  <!ATTLIST METRICS NAME CDATA #REQUIRED>
  <!ATTLIST METRICS SUM CDATA #REQUIRED>
  <!ATTLIST METRICS NUM CDATA #REQUIRED>
  <!ATTLIST METRICS TYPE (string | int8 | uint8 | int16 | uint16 | int32 | uint32 | float | double | timestamp) #REQUIRED>
  <!ATTLIST METRICS UNITS CDATA #IMPLIED>
  <!ATTLIST METRICS SLOPE (zero | positive | negative | both | unspecified) #IMPLIED>
  <!ATTLIST METRICS SOURCE (gmond) 'gmond'>

]>
<GANGLIA_XML VERSION="3.7.2" SOURCE="gmond">



<EXTRA_DATA>
<EXTRA_ELEMENT NAME="TITLE" VAL="Total Running Processes"/>
<EXTRA_ELEMENT NAME="DESC" VAL="Total Running Processes"/>
<EXTRA_ELEMENT NAME="GROUP" VAL="process"/>
</EXTRA_DATA>


<EXTRA_DATA>
<EXTRA_ELEMENT NAME="TITLE" VAL="win10: VM Packets Received"/>
<EXTRA_ELEMENT NAME="DESC" VAL="win10: VM Packets Received"/>
<EXTRA_ELEMENT NAME="GROUP" VAL="vm network"/>
</EXTRA_DATA>


<EXTRA_DATA>
<EXTRA_ELEMENT NAME="TITLE" VAL="CPU Guest"/>
<EXTRA_ELEMENT NAME="DESC" VAL="CPU Guest"/>
<EXTRA_ELEMENT NAME="GROUP" VAL="cpu"/>
</EXTRA_DATA>


<EXTRA_DATA>
<EXTRA_ELEMENT NAME="TITLE" VAL="win10: VM Bytes Received"/>
<EXTRA_ELEMENT NAME="DESC" VAL="win10: VM Bytes Received"/>
<EXTRA_ELEMENT NAME="GROUP" VAL="vm network"/>
</EXTRA_DATA>


<EXTRA_DATA>
<EXTRA_ELEMENT NAME="TITLE" VAL="win10: VM Bytes Sent"/>
<EXTRA_ELEMENT NAME="DESC" VAL="win10: VM Bytes Sent"/>
<EXTRA_ELEMENT NAME="GROUP" VAL="vm network"/>
</EXTRA_DATA>


<EXTRA_DATA>
<EXTRA_ELEMENT NAME="TITLE" VAL="win10: VM Output Drops"/>
<EXTRA_ELEMENT NAME="DESC" VAL="win10: VM Output Drops"/>
<EXTRA_ELEMENT NAME="GROUP" VAL="vm network"/>
</EXTRA_DATA>


<EXTRA_DATA>
<EXTRA_ELEMENT NAME="TITLE" VAL="Output Errors"/>
<EXTRA_ELEMENT NAME="DESC" VAL="Output Errors"/>
<EXTRA_ELEMENT NAME="GROUP" VAL="network"/>
</EXTRA_DATA>


<EXTRA_DATA>
<EXTRA_ELEMENT NAME="TITLE" VAL="Pages In"/>
<EXTRA_ELEMENT NAME="DESC" VAL="Pages In"/>
<EXTRA_ELEMENT NAME="GROUP" VAL="memory"/>
</EXTRA_DATA>


<EXTRA_DATA>
<EXTRA_ELEMENT NAME="TITLE" VAL="CPU I/O Wait"/>
<EXTRA_ELEMENT NAME="DESC" VAL="CPU I/O Wait"/>
<EXTRA_ELEMENT NAME="GROUP" VAL="cpu"/>
</EXTRA_DATA>


<EXTRA_DATA>
<EXTRA_ELEMENT NAME="TITLE" VAL="Cached Memory"/>
<EXTRA_ELEMENT NAME="DESC" VAL="Cached Memory"/>
<EXTRA_ELEMENT NAME="GROUP" VAL="memory"/>
</EXTRA_DATA>


<EXTRA_DATA>
<EXTRA_ELEMENT NAME="TITLE" VAL="win10: VM Disk Errors"/>
<EXTRA_ELEMENT NAME="DESC" VAL="win10: VM Disk Errors"/>
<EXTRA_ELEMENT NAME="GROUP" VAL="vm disk"/>
</EXTRA_DATA>


<EXTRA_DATA>
<EXTRA_ELEMENT NAME="TITLE" VAL="Hypervisor Free Memory"/>
<EXTRA_ELEMENT NAME="DESC" VAL="Hypervisor Free Memory"/>
<EXTRA_ELEMENT NAME="GROUP" VAL="hypervisor"/>
</EXTRA_DATA>


<EXTRA_DATA>
<EXTRA_ELEMENT NAME="TITLE" VAL="Free Swap Space"/>
<EXTRA_ELEMENT NAME="DESC" VAL="Free Swap Space"/>
<EXTRA_ELEMENT NAME="GROUP" VAL="memory"/>
</EXTRA_DATA>


<EXTRA_DATA>
<EXTRA_ELEMENT NAME="TITLE" VAL="Interrupts"/>
<EXTRA_ELEMENT NAME="DESC" VAL="Interrupts"/>
<EXTRA_ELEMENT NAME="GROUP" VAL="cpu"/>
</EXTRA_DATA>


<EXTRA_DATA>
<EXTRA_ELEMENT NAME="TITLE" VAL="Memory Total"/>
<EXTRA_ELEMENT NAME="DESC" VAL="Memory Total"/>
<EXTRA_ELEMENT NAME="GROUP" VAL="memory"/>
</EXTRA_DATA>


<EXTRA_DATA>
<EXTRA_ELEMENT NAME="TITLE" VAL="Last Boot Time"/>
<EXTRA_ELEMENT NAME="DESC" VAL="Last Boot Time"/>
<EXTRA_ELEMENT NAME="GROUP" VAL="cpu"/>
</EXTRA_DATA>


<EXTRA_DATA>
<EXTRA_ELEMENT NAME="TITLE" VAL="win10: VM Input Errors"/>
<EXTRA_ELEMENT NAME="DESC" VAL="win10: VM Input Errors"/>
<EXTRA_ELEMENT NAME="GROUP" VAL="vm network"/>
</EXTRA_DATA>


<EXTRA_DATA>
<EXTRA_ELEMENT NAME="TITLE" VAL="win10: VM CPU Count"/>
<EXTRA_ELEMENT NAME="DESC" VAL="win10: VM CPU Count"/>
<EXTRA_ELEMENT NAME="GROUP" VAL="vm cpu"/>
</EXTRA_DATA>


<EXTRA_DATA>
<EXTRA_ELEMENT NAME="TITLE" VAL="CPU System"/>
<EXTRA_ELEMENT NAME="DESC" VAL="CPU System"/>
<EXTRA_ELEMENT NAME="GROUP" VAL="cpu"/>
</EXTRA_DATA>


<EXTRA_DATA>
<EXTRA_ELEMENT NAME="TITLE" VAL="Free Disk Space"/>
<EXTRA_ELEMENT NAME="DESC" VAL="Free Disk Space"/>
<EXTRA_ELEMENT NAME="GROUP" VAL="disk"/>
</EXTRA_DATA>


<EXTRA_DATA>
<EXTRA_ELEMENT NAME="TITLE" VAL="win10: VM Reads"/>
<EXTRA_ELEMENT NAME="DESC" VAL="win10: VM Reads"/>
<EXTRA_ELEMENT NAME="GROUP" VAL="vm disk"/>
</EXTRA_DATA>


<EXTRA_DATA>
<EXTRA_ELEMENT NAME="TITLE" VAL="Hypervisor CPU Count"/>
<EXTRA_ELEMENT NAME="DESC" VAL="Hypervisor CPU Count"/>
<EXTRA_ELEMENT NAME="GROUP" VAL="hypervisor"/>
</EXTRA_DATA>


<EXTRA_DATA>
<EXTRA_ELEMENT NAME="TITLE" VAL="win10: VM Memory Total"/>
<EXTRA_ELEMENT NAME="DESC" VAL="win10: VM Memory Total"/>
<EXTRA_ELEMENT NAME="GROUP" VAL="vm memory"/>
</EXTRA_DATA>


<EXTRA_DATA>
<EXTRA_ELEMENT NAME="TITLE" VAL="Packets Received"/>
<EXTRA_ELEMENT NAME="DESC" VAL="Packets Received"/>
<EXTRA_ELEMENT NAME="GROUP" VAL="network"/>
</EXTRA_DATA>


<EXTRA_DATA>
<EXTRA_ELEMENT NAME="TITLE" VAL="Packets Sent"/>
<EXTRA_ELEMENT NAME="DESC" VAL="Packets Sent"/>
<EXTRA_ELEMENT NAME="GROUP" VAL="network"/>
</EXTRA_DATA>


<EXTRA_DATA>
<EXTRA_ELEMENT NAME="TITLE" VAL="Write Time"/>
<EXTRA_ELEMENT NAME="DESC" VAL="Write Time"/>
<EXTRA_ELEMENT NAME="GROUP" VAL="disk"/>
</EXTRA_DATA>


<EXTRA_DATA>
<EXTRA_ELEMENT NAME="TITLE" VAL="win10: VM Writes"/>
<EXTRA_ELEMENT NAME="DESC" VAL="win10: VM Writes"/>
<EXTRA_ELEMENT NAME="GROUP" VAL="vm disk"/>
</EXTRA_DATA>


<EXTRA_DATA>
<EXTRA_ELEMENT NAME="TITLE" VAL="Hypervisor Memory Total"/>
<EXTRA_ELEMENT NAME="DESC" VAL="Hypervisor Memory Total"/>
<EXTRA_ELEMENT NAME="GROUP" VAL="hypervisor"/>
</EXTRA_DATA>


<EXTRA_DATA>
<EXTRA_ELEMENT NAME="TITLE" VAL="Input Drops"/>
<EXTRA_ELEMENT NAME="DESC" VAL="Input Drops"/>
<EXTRA_ELEMENT NAME="GROUP" VAL="network"/>
</EXTRA_DATA>


<EXTRA_DATA>
<EXTRA_ELEMENT NAME="TITLE" VAL="Writes"/>
<EXTRA_ELEMENT NAME="DESC" VAL="Writes"/>
<EXTRA_ELEMENT NAME="GROUP" VAL="disk"/>
</EXTRA_DATA>


<EXTRA_DATA>
<EXTRA_ELEMENT NAME="TITLE" VAL="win10: Free VDisk Space"/>
<EXTRA_ELEMENT NAME="DESC" VAL="win10: Free VDisk Space"/>
<EXTRA_ELEMENT NAME="GROUP" VAL="vm disk"/>
</EXTRA_DATA>


<EXTRA_DATA>
<EXTRA_ELEMENT NAME="TITLE" VAL="One minute load average"/>
<EXTRA_ELEMENT NAME="DESC" VAL="One minute load average"/>
<EXTRA_ELEMENT NAME="GROUP" VAL="load"/>
</EXTRA_DATA>


<EXTRA_DATA>
<EXTRA_ELEMENT NAME="TITLE" VAL="win10: VM Output Errors"/>
<EXTRA_ELEMENT NAME="DESC" VAL="win10: VM Output Errors"/>
<EXTRA_ELEMENT NAME="GROUP" VAL="vm network"/>
</EXTRA_DATA>


<EXTRA_DATA>
<EXTRA_ELEMENT NAME="TITLE" VAL="Read Time"/>
<EXTRA_ELEMENT NAME="DESC" VAL="Read Time"/>
<EXTRA_ELEMENT NAME="GROUP" VAL="disk"/>
</EXTRA_DATA>


<EXTRA_DATA>
<EXTRA_ELEMENT NAME="TITLE" VAL="Memory Buffers"/>
<EXTRA_ELEMENT NAME="DESC" VAL="Memory Buffers"/>
<EXTRA_ELEMENT NAME="GROUP" VAL="memory"/>
</EXTRA_DATA>


<EXTRA_DATA>
<EXTRA_ELEMENT NAME="TITLE" VAL="win10: VDisk Capacity"/>
<EXTRA_ELEMENT NAME="DESC" VAL="win10: VDisk Capacity"/>
<EXTRA_ELEMENT NAME="GROUP" VAL="vm disk"/>
</EXTRA_DATA>


<EXTRA_DATA>
<EXTRA_ELEMENT NAME="TITLE" VAL="Maximum Disk Space Used"/>
<EXTRA_ELEMENT NAME="DESC" VAL="Maximum Disk Space Used"/>
<EXTRA_ELEMENT NAME="GROUP" VAL="disk"/>
</EXTRA_DATA>


<EXTRA_DATA>
<EXTRA_ELEMENT NAME="TITLE" VAL="Reads"/>
<EXTRA_ELEMENT NAME="DESC" VAL="Reads"/>
<EXTRA_ELEMENT NAME="GROUP" VAL="disk"/>
</EXTRA_DATA>


<EXTRA_DATA>
<EXTRA_ELEMENT NAME="TITLE" VAL="win10: VM Memory Utilization"/>
<EXTRA_ELEMENT NAME="DESC" VAL="win10: VM Memory Utilization"/>
<EXTRA_ELEMENT NAME="GROUP" VAL="vm memory"/>
</EXTRA_DATA>


<EXTRA_DATA>
<EXTRA_ELEMENT NAME="TITLE" VAL="win10: VM Bytes Written"/>
<EXTRA_ELEMENT NAME="DESC" VAL="win10: VM Bytes Written"/>
<EXTRA_ELEMENT NAME="GROUP" VAL="vm disk"/>
</EXTRA_DATA>


<EXTRA_DATA>
<EXTRA_ELEMENT NAME="TITLE" VAL="Swap Pages In"/>
<EXTRA_ELEMENT NAME="DESC" VAL="Swap Pages In"/>
<EXTRA_ELEMENT NAME="GROUP" VAL="memory"/>
</EXTRA_DATA>


<EXTRA_DATA>
<EXTRA_ELEMENT NAME="TITLE" VAL="Input Errors"/>
<EXTRA_ELEMENT NAME="DESC" VAL="Input Errors"/>
<EXTRA_ELEMENT NAME="GROUP" VAL="network"/>
</EXTRA_DATA>


<EXTRA_DATA>
<EXTRA_ELEMENT NAME="TITLE" VAL="CPU Count"/>
<EXTRA_ELEMENT NAME="DESC" VAL="CPU Count"/>
<EXTRA_ELEMENT NAME="GROUP" VAL="cpu"/>
</EXTRA_DATA>


<EXTRA_DATA>
<EXTRA_ELEMENT NAME="TITLE" VAL="win10: VM CPU Utilization"/>
<EXTRA_ELEMENT NAME="DESC" VAL="win10: VM CPU Utilization"/>
<EXTRA_ELEMENT NAME="GROUP" VAL="vm cpu"/>
</EXTRA_DATA>


<EXTRA_DATA>
<EXTRA_ELEMENT NAME="TITLE" VAL="CPU Speed"/>
<EXTRA_ELEMENT NAME="DESC" VAL="CPU Speed"/>
<EXTRA_ELEMENT NAME="GROUP" VAL="cpu"/>
</EXTRA_DATA>

Connection closed by foreign host.

I have the same problem with the same version of gmond. If I comment out the sFlow section in gmond.conf, the problem goes away. This is pretty major for us: we're using gmond instances to collect sFlow data from computer labs at the university to monitor usage.

Stack trace of the crash:
(gdb) backtrace
#0 strlen () at ../sysdeps/x86_64/strlen.S:106
#1 0x00007f9951f438e8 in apr_vformatter (
flush_func=flush_func@entry=0x7f9951f42190 <snprintf_flush>,
vbuff=vbuff@entry=0x7f9946233340, fmt=0x55a18820b011 "s",
ap=ap@entry=0x7f9946233350) at strings/apr_snprintf.c:974
#2 0x00007f9951f44cc9 in apr_snprintf (buf=, len=1024,
format=) at strings/apr_snprintf.c:1379
#3 0x000055a187bc84da in gmetric_value_to_str (message=0x55a1881f7958)
at gmond.c:1937
#4 print_host_metric (now=1480604216779882, val=0x55a1881f7948,
data=0x55a1881f78b0, client=0x55a188127c60) at gmond.c:1972
#5 process_tcp_accept_channel (desc=, desc=,
now=1480604216779882) at gmond.c:2106
#6 poll_tcp_listen_channels (timeout=100000, now=1480604216779882)
at gmond.c:2207
#7 tcp_listener (thd=0x55a1881052b8, data=) at gmond.c:3285
#8 0x00007f9950c0e5ca in start_thread (arg=0x7f9946234700)
at pthread_create.c:333
#9 0x00007f99509480ed in clone ()
at ../sysdeps/unix/sysv/linux/x86_64/clone.S:109

Line which seems to be at fault.

case gmetric_int:
/* For right now.. there are no metric which are signed ints... use u_int */
apr_snprintf(value, 1024, message->Ganglia_value_msg_u.gs_int.fmt, message->Ganglia_value_msg_u.gs_int.si);
return value;

Added some debug statements to try and work out what's causing the problem:

// in print_host_metric
Last metric before the bang was machine_type
// in gmetric_value_to_str
in case statement (uint) format is %s uint is 971708432

Segmentation fault

Is that right? I've printed some of the others, and for those the format string seems to match the output.

Found and fixed the issue, but I'm not sure how I'd go about submitting this. The bug is in sflow.c:

/*
 * Submit one string-valued sFlow metric for a host.
 *
 * hostdata      - host record, passed through to submit_sflow_gmetric()
 * metric_prefix - prefix handed to set_metric_name_and_title()
 * tag           - metric tag; also indexes SFLOWGMetricTable for the format
 * val           - string value, used only when ok is true
 * ok            - when false, sflowCFG.null_str is sent in place of val, and
 *                 only if sflowCFG.submit_null_str is set; otherwise nothing
 *                 is submitted
 */
static void
submit_sflow_string(Ganglia_host *hostdata, char *metric_prefix, EnumSFLOWGMetric tag, const char *val, bool_t ok)
{
  Ganglia_metadata_msg meta_msg = { 0 };
  Ganglia_value_msg value_msg = { 0 };
  char name_storage[SFLOW_MAX_METRIC_NAME_LEN];
  char title_storage[SFLOW_MAX_METRIC_NAME_LEN];
  char *metric_name, *metric_title;

  /* No usable value and null strings are not being submitted: nothing to do. */
  if(!ok && !sflowCFG.submit_null_str) {
    return;
  }

  set_metric_name_and_title(&metric_name, &metric_title, name_storage, title_storage, metric_prefix, tag);

  /*
   * Bug fix: both message ids must be gmetric_string. The previous code set
   * them to gmetric_uint, i.e. the wrong type for a string metric. All the
   * conversion functions are similar, so this was probably an error
   * introduced through copy and paste during development.
   */
  meta_msg.id = gmetric_string;
  value_msg.id = gmetric_string;

  meta_msg.Ganglia_metadata_msg_u.gfull.metric.type = "string";

  value_msg.Ganglia_value_msg_u.gstr.metric_id.name = metric_name;
  value_msg.Ganglia_value_msg_u.gstr.fmt = SFLOWGMetricTable[tag].format;
  if(ok) {
    value_msg.Ganglia_value_msg_u.gstr.str = (char *)val;
  }
  else {
    /* Fall back to the configured placeholder string. */
    value_msg.Ganglia_value_msg_u.gstr.str = sflowCFG.null_str;
  }

  submit_sflow_gmetric(hostdata, metric_name, metric_title, tag, &meta_msg, &value_msg);
}

You can submit a pull request on Github.

I'll try and do that early this week. Many thanks.

sflow commented

Looks like this was my bug (from about 5 years ago). Sorry for the trouble! Glad you found the fix.

Neil