Add playbooks and templates for CPUHog
This commit is contained in:
18
playbooks/investigate_high_cpu.yml
Normal file
18
playbooks/investigate_high_cpu.yml
Normal file
@@ -0,0 +1,18 @@
|
||||
---
# Collects a snapshot of the busiest processes on every targeted host and
# prints it through the CPUHog ticket template.
- name: Investigate High CPU
  hosts: all
  become: true
  tasks:
    # `ps` only reads process state, so these tasks must never report "changed".
    - name: Gather information on top CPU consuming processes
      ansible.builtin.command:
        cmd: 'ps -eo pid,ppid,%mem,%cpu,cmd --sort=-%cpu'
      register: processes_cpu
      changed_when: false

    - name: Gather information on top Memory consuming processes
      ansible.builtin.command:
        cmd: 'ps -eo pid,ppid,%mem,%cpu,cmd --sort=-%mem'
      register: processes_mem
      changed_when: false

    # The template consumes processes_cpu and processes_mem registered above.
    - name: Dump CPU details
      ansible.builtin.debug:
        msg: "{{ lookup('template','../templates/cpuhog_ticket.j2') }}"
|
||||
@@ -10,8 +10,8 @@
|
||||
- name: Resolve Disk Usage
|
||||
condition:
|
||||
all:
|
||||
- event.alert.labels.org == "OYS" and event.alert.status == "firing" \
|
||||
and event.alert.labels.alertname == "root filesystem over 80% full"
|
||||
- event.alert.labels.org == "OYS" and event.alert.status == "firing"
|
||||
- event.alert.labels.alertname == "root filesystem over 80% full"
|
||||
actions:
|
||||
- run_job_template:
|
||||
name: Demo - Clean Log Directory
|
||||
@@ -22,17 +22,28 @@
|
||||
alertmanager_generator_url: "{{ event.alert.generatorURL }}"
|
||||
event_mountpoint: "{{ event.alert.labels.mountpoint }}"
|
||||
alertmanager_instance: "{{ event.alert.labels.instance }}"
|
||||
|
||||
- name: Investigate High CPU
|
||||
condition:
|
||||
all:
|
||||
- event.alert.labels.org == "OYS" and event.alert.status == "firing" \
|
||||
and event.alert.labels.alertname == "ProcessCPUHog"
|
||||
# Single combined condition: each entry under `all` must be a complete
# expression, so a clause cannot start with `and`.
- >-
  event.alert.labels.org == "OYS" and event.alert.status == "firing"
  and event.alert.labels.alertname == "ProcessCPUHog"
|
||||
actions:
|
||||
- print_event:
|
||||
pretty: true
|
||||
- run_job_template:
|
||||
name: Demo - Investigate High CPU
|
||||
organization: OYS
|
||||
job_args:
|
||||
extra_vars:
|
||||
alertmanager_annotations: "{{ event.alert.annotations }}"
|
||||
alertmanager_generator_url: "{{ event.alert.generatorURL }}"
|
||||
event_severity: "{{ event.alert.labels.severity }}"
|
||||
alertmanager_instance: "{{ event.alert.labels.instance }}"
|
||||
event_values: "{{ event.alert.values }}"
|
||||
|
||||
- name: Test Contact Point
|
||||
condition: event.alert.labels.alertname == "TestAlert" or event.alert.labels.org == "OYS"
|
||||
condition: event.alert.labels.alertname == "TestAlert" and event.alert.labels.org == "OYS"
|
||||
actions:
|
||||
- print_event:
|
||||
pretty: true
|
||||
|
||||
20
templates/cpuhog_ticket.j2
Normal file
20
templates/cpuhog_ticket.j2
Normal file
@@ -0,0 +1,20 @@
|
||||
= CPUHog Report =
|
||||
A high CPU event was triggered from AlertManager.
|
||||
|
||||
{% if event is defined %}
|
||||
Annotations: "{{ event.alert.annotations }}"
|
||||
Generator URL: "{{ event.alert.generatorURL }}"
|
||||
Severity: "{{ event.alert.labels.severity }}"
|
||||
Instance: "{{ event.alert.labels.instance }}"
|
||||
Values: "{{ event.alert.values }}"
|
||||
{% endif %}
|
||||
|
||||
** Top CPU Consumers **
|
||||
{% for line in processes_cpu.stdout_lines[0:10] %}
|
||||
{{ line }}
|
||||
{% endfor %}
|
||||
|
||||
** Top Memory Consumers **
|
||||
{% for line in processes_mem.stdout_lines[0:10] %}
|
||||
{{ line }}
|
||||
{% endfor %}
|
||||
Reference in New Issue
Block a user