diff --git a/ansible/roles/prometheus/tasks/main.yml b/ansible/roles/prometheus/tasks/main.yml
index 25525ae..d6a1b7c 100644
--- a/ansible/roles/prometheus/tasks/main.yml
+++ b/ansible/roles/prometheus/tasks/main.yml
@@ -1,62 +1,73 @@
----
-
-- name: Load variables
-  include_vars: "{{ ansible_os_family|lower }}.yml"
-
-- name: Install Prometheus
-  tags: prometheus-install
-  ansible.builtin.package:
-    name: '{{ prometheus_package }}'
-    state: latest
-
-- name: Add scrape configuration
-  tags: prometheus-scrape-configure
-  ansible.builtin.copy:
-    src: "{{ item.src }}"
-    dest: "/etc/prometheus/{{ item.dest }}"
-  loop:
-    - { src: '../templates/scrape-main.yml', dest: 'scrape-main.yml' }
-    - { src: '../templates/scrape-blackbox.yml', dest: 'scrape-blackbox.yml' }
-  notify:
-    - restart prometheus
-
-- name: Create folder for rules definition
-  ansible.builtin.file:
-    path: /etc/prometheus/rules
-    state: directory
-    mode: 0755
-
-- name: Add rules configuration
-  tags: alertmanager-rules-configure
-  ansible.builtin.copy:
-    src: "{{ item.src }}"
-    dest: "{{ alertmanager_rules }}/{{ item.dest }}"
-  loop:
-    - { src: '../templates/hosts.rules', dest: 'hosts.rules.yml' }
-    - { src: '../templates/prometheus.rules', dest: 'prometheus.rules.yml' }
-    - { src: '../templates/blackbox.rules', dest: 'blackbox.rules.yml' }
-  notify:
-    - restart prometheus
-    - restart alertmanager
-
-- name: Enable Prometheus configuration
-  tags: prometheus-configure
-  ansible.builtin.template:
-    src: prometheus.config.j2
-    dest: '{{ prometheus_config }}'
-    validate: promtool check config %s
-  notify: restart prometheus
-
-- name: Enable Prometheus service
-  tags: prometheus-enable
-  ansible.builtin.service:
-    name: '{{ prometheus_service }}'
-    state: started
-    enabled: yes
-
-- name: Enable Alertmanager service
-  tags: alertmanager-enable
-  ansible.builtin.service:
-    name: '{{ alertmanager_service }}'
-    state: started
-    enabled: yes
+---
+# Tasks for the prometheus role: install Prometheus, deploy the scrape, rule
+# and Alertmanager configuration, then enable both services.
+
+- name: Load variables
+  ansible.builtin.include_vars: "{{ ansible_os_family|lower }}.yml"
+
+- name: Install Prometheus
+  tags: prometheus-install
+  ansible.builtin.package:
+    name: '{{ prometheus_package }}'
+    state: latest
+
+- name: Add scrape configuration
+  tags: prometheus-scrape-configure
+  ansible.builtin.copy:
+    src: "{{ item.src }}"
+    dest: "/etc/prometheus/{{ item.dest }}"
+  loop:
+    - { src: '../templates/scrape-main.yml', dest: 'scrape-main.yml' }
+    - { src: '../templates/scrape-blackbox.yml', dest: 'scrape-blackbox.yml' }
+  notify:
+    - restart prometheus
+
+# Render the Alertmanager configuration from the alertmanager.yml.j2 template.
+- name: Add alertmanager configuration
+  tags: alertmanager-configure
+  ansible.builtin.template:
+    src: alertmanager.yml.j2
+    dest: '{{ alertmanager_config }}'
+  notify:
+    - restart alertmanager
+
+- name: Create folder for rules definition
+  ansible.builtin.file:
+    path: /etc/prometheus/rules
+    state: directory
+    mode: '0755'
+
+- name: Add rules configuration
+  tags: alertmanager-rules-configure
+  ansible.builtin.copy:
+    src: "{{ item.src }}"
+    dest: "{{ alertmanager_rules }}/{{ item.dest }}"
+  loop:
+    - { src: '../templates/hosts.rules', dest: 'hosts.rules.yml' }
+    - { src: '../templates/prometheus.rules', dest: 'prometheus.rules.yml' }
+    - { src: '../templates/blackbox.rules', dest: 'blackbox.rules.yml' }
+  notify:
+    - restart prometheus
+    - restart alertmanager
+
+- name: Enable Prometheus configuration
+  tags: prometheus-configure
+  ansible.builtin.template:
+    src: prometheus.config.j2
+    dest: '{{ prometheus_config }}'
+    validate: promtool check config %s
+  notify: restart prometheus
+
+- name: Enable Prometheus service
+  tags: prometheus-enable
+  ansible.builtin.service:
+    name: '{{ prometheus_service }}'
+    state: started
+    enabled: true
+
+- name: Enable Alertmanager service
+  tags: alertmanager-enable
+  ansible.builtin.service:
+    name: '{{ alertmanager_service }}'
+    state: started
+    enabled: true
diff --git a/ansible/roles/prometheus/templates/alertmanager.yml.j2 b/ansible/roles/prometheus/templates/alertmanager.yml.j2
new file mode 100644
index 0000000..7934f50
--- /dev/null
+++ b/ansible/roles/prometheus/templates/alertmanager.yml.j2
@@ -0,0 +1,68 @@
+{{ ansible_managed | comment }}
+# See https://prometheus.io/docs/alerting/configuration/ for documentation.
+
+global:
+  # The smarthost and SMTP sender used for mail notifications.
+  smtp_smarthost: 'mail.labolyon.fr:587'
+  smtp_from: 'alerts@labolyon.fr'
+  smtp_auth_username: 'alerts@labolyon.fr'
+  # Serialised with to_json so the secret is always rendered as a quoted
+  # YAML string, whatever characters it contains.
+  smtp_auth_password: {{ lookup('community.general.passwordstore', 'monitoring/e-mail/alerts@labolyon.fr') | to_json }}
+
+# The directory from which notification templates are read.
+templates:
+- '/etc/prometheus/alertmanager_templates/*.tmpl'
+
+# The root route on which each incoming alert enters.
+route:
+  # The labels by which incoming alerts are grouped together. For example,
+  # multiple alerts coming in for cluster=A and alertname=LatencyHigh would
+  # be batched into a single group.
+  #
+  # To aggregate by all possible labels use '...' as the sole label name.
+  # This effectively disables aggregation entirely, passing through all
+  # alerts as-is. This is unlikely to be what you want, unless you have
+  # a very low alert volume or your upstream notification system performs
+  # its own grouping. Example: group_by: [...]
+  group_by: ['alertname', 'cluster', 'service']
+
+  # When a new group of alerts is created by an incoming alert, wait at
+  # least 'group_wait' to send the initial notification.
+  # This way ensures that you get multiple alerts for the same group that start
+  # firing shortly after another are batched together on the first
+  # notification.
+  group_wait: 30s
+
+  # When the first notification was sent, wait 'group_interval' to send a batch
+  # of new alerts that started firing for that group.
+  group_interval: 5m
+
+  # If an alert has successfully been sent, wait 'repeat_interval' to
+  # resend them.
+  repeat_interval: 3h
+
+  # A default receiver
+  receiver: all-admins-email
+
+  # All the above attributes are inherited by all child routes and can
+  # be overwritten on each.
+
+# Inhibition rules allow to mute a set of alerts given that another alert is
+# firing.
+# We use this to mute any warning-level notifications if the same alert is
+# already critical.
+inhibit_rules:
+  - source_matchers: [severity="critical"]
+    target_matchers: [severity="warning"]
+    # Apply inhibition if the alertname is the same.
+    # CAUTION:
+    # If all label names listed in `equal` are missing
+    # from both the source and target alerts,
+    # the inhibition rule will apply!
+    equal: [alertname, cluster, service]
+
+receivers:
+  - name: 'all-admins-email'
+    email_configs:
+      - to: 'mirsal@mirsal.fr'