From e5ec521ec4d138c26d0aa7d5b6c639f3703925c9 Mon Sep 17 00:00:00 2001
From: Patrick Toal
Date: Wed, 17 Apr 2024 13:16:41 -0400
Subject: [PATCH] Add playbooks and templates for CPUHog

Add a playbook that captures `ps` output sorted by CPU and by memory
usage and renders the first ten lines of each through a new
cpuhog_ticket.j2 template.

In the alertmanager rulebook, split the backslash-continued conditions
into separate entries of the `all:` list, launch the
"Demo - Investigate High CPU" job template when a ProcessCPUHog alert
fires, and require both the TestAlert name and the OYS org for the
"Test Contact Point" rule.

---
Notes (ignored by `git am`):

- The second `all:` entry of the "Investigate High CPU" rule originally
  read `- and event.alert.labels.alertname == "ProcessCPUHog"`. Each list
  entry must be a complete condition, so the stray leading `and` (left
  over from the old line continuation) was removed by hand. The
  post-image blob id on the rulebook `index` line predates that
  correction.
- NOTE(review): cpuhog_ticket.j2 prints the alert details only when a
  variable named `event` is defined, while the rulebook passes
  `alertmanager_*` / `event_*` extra_vars to the job template. Confirm
  that `event` is actually available to the playbook, otherwise that
  section of the report is skipped.

 playbooks/investigate_high_cpu.yml  | 18 ++++++++++++++++++
 rulebooks/alertmanager_listener.yml | 21 ++++++++++++++++-----
 templates/cpuhog_ticket.j2          | 20 ++++++++++++++++++++
 3 files changed, 54 insertions(+), 5 deletions(-)
 create mode 100644 playbooks/investigate_high_cpu.yml
 create mode 100644 templates/cpuhog_ticket.j2

diff --git a/playbooks/investigate_high_cpu.yml b/playbooks/investigate_high_cpu.yml
new file mode 100644
index 0000000..f4ea349
--- /dev/null
+++ b/playbooks/investigate_high_cpu.yml
@@ -0,0 +1,18 @@
+---
+- name: Investigate High CPU
+  hosts: all
+  become: true
+  tasks:
+    - name: Gather information on top CPU consuming processes
+      ansible.builtin.command:
+        cmd: 'ps -eo pid,ppid,%mem,%cpu,cmd --sort=-%cpu'
+      register: processes_cpu
+
+    - name: Gather information on top Memory consuming processes
+      ansible.builtin.command:
+        cmd: 'ps -eo pid,ppid,%mem,%cpu,cmd --sort=-%mem'
+      register: processes_mem
+
+    - name: Dump CPU details
+      ansible.builtin.debug:
+        msg: "{{ lookup('template','../templates/cpuhog_ticket.j2') }}"
\ No newline at end of file
diff --git a/rulebooks/alertmanager_listener.yml b/rulebooks/alertmanager_listener.yml
index 0765216..f646338 100644
--- a/rulebooks/alertmanager_listener.yml
+++ b/rulebooks/alertmanager_listener.yml
@@ -10,8 +10,8 @@
     - name: Resolve Disk Usage
       condition:
         all:
-          - event.alert.labels.org == "OYS" and event.alert.status == "firing" \
-            and event.alert.labels.alertname == "root filesystem over 80% full"
+          - event.alert.labels.org == "OYS" and event.alert.status == "firing"
+          - event.alert.labels.alertname == "root filesystem over 80% full"
       actions:
         - run_job_template:
             name: Demo - Clean Log Directory
@@ -22,17 +22,28 @@
                 alertmanager_generator_url: "{{ event.alert.generatorURL }}"
                 event_mountpoint: "{{ event.alert.labels.mountpoint }}"
                 alertmanager_instance: "{{ event.alert.labels.instance }}"
+
     - name: Investigate High CPU
       condition:
         all:
-          - event.alert.labels.org == "OYS" and event.alert.status == "firing" \
-            and event.alert.labels.alertname == "ProcessCPUHog"
+          - event.alert.labels.org == "OYS" and event.alert.status == "firing"
+          - event.alert.labels.alertname == "ProcessCPUHog"
       actions:
         - print_event:
             pretty: true
+        - run_job_template:
+            name: Demo - Investigate High CPU
+            organization: OYS
+            job_args:
+              extra_vars:
+                alertmanager_annotations: "{{ event.alert.annotations }}"
+                alertmanager_generator_url: "{{ event.alert.generatorURL }}"
+                event_severity: "{{ event.alert.labels.severity }}"
+                alertmanager_instance: "{{ event.alert.labels.instance }}"
+                event_values: "{{ event.alert.values }}"
 
     - name: Test Contact Point
-      condition: event.alert.labels.alertname == "TestAlert" or event.alert.labels.org == "OYS"
+      condition: event.alert.labels.alertname == "TestAlert" and event.alert.labels.org == "OYS"
       actions:
         - print_event:
             pretty: true
diff --git a/templates/cpuhog_ticket.j2 b/templates/cpuhog_ticket.j2
new file mode 100644
index 0000000..f33f02d
--- /dev/null
+++ b/templates/cpuhog_ticket.j2
@@ -0,0 +1,20 @@
+= CPUHog Report =
+A high CPU event was triggered from AlertManager.
+
+{% if event is defined %}
+Annotations: "{{ event.alert.annotations }}"
+Generator URL: "{{ event.alert.generatorURL }}"
+Severity: "{{ event.alert.labels.severity }}"
+Instance: "{{ event.alert.labels.instance }}"
+Values: "{{ event.alert.values }}"
+{% endif %}
+
+** Top CPU Consumers **
+{% for line in processes_cpu.stdout_lines[0:10] %}
+{{ line }}
+{% endfor %}
+
+** Top Memory Consumers **
+{% for line in processes_mem.stdout_lines[0:10] %}
+{{ line }}
+{% endfor %}
\ No newline at end of file