add monitoring, manual dns

o.vodianov 2025-08-31 14:43:33 +04:00
parent 47d9aaf150
commit 6290d1ec1f
24 changed files with 550 additions and 2 deletions

ansible/dns-manual.yml Normal file

@@ -0,0 +1,7 @@
---
# To update only the DNS zones, run:
# ansible-playbook -i hosts dns-manual.yml --diff -t manual
- hosts: dns
become: true
roles:
- dns

ansible/dns.yml Normal file

@@ -0,0 +1,6 @@
---
# ansible-playbook -i hosts dns.yml
- hosts: dns
become: true
roles:
- dns

@@ -0,0 +1,31 @@
---
monitoring_base_dir: "/opt/monitoring"
monitoring_docker_network: "monitoring"
# Image versions
images:
prometheus: "prom/prometheus:v2.53.0"
alertmanager: "prom/alertmanager:v0.27.0"
loki: "grafana/loki:2.9.8"
grafana: "grafana/grafana:11.1.0"
promtail: "grafana/promtail:2.9.8"
# Retention and related settings
prometheus_retention: "30d"
loki_retention_bytes: 107374182400 # 100 GiB
# Alertmanager → Telegram (keep these in vault)
telegram_bot_token: "VAULT_TELEGRAM_BOT_TOKEN"
telegram_chat_id: "VAULT_TELEGRAM_CHAT_ID"
# Loki URL for the agents
loki_http_url: "http://monitor1:3100"
# Prometheus target list (minimal example)
prometheus_targets:
node_exporter:
- "pve1:9100"
- "pve2:9100"
- "vm1:9100"
  blackbox_http: # examples, in case you add blackbox later
- "https://example.com/health"

ansible/nas.yml Normal file

@@ -0,0 +1,11 @@
---
# If only password auth is configured on the server, run:
# ansible-playbook -i hosts nas.yml --diff -Kk
- hosts: nas
become: true
vars_files:
- /Users/o.vodianov/Documents/home.lab.local/vars.yml
roles:
- nas
- sudoers
- ssh

@@ -0,0 +1,26 @@
;
; BIND reverse data file for local loopback interface
;
$TTL 604800
@ IN SOA ns1.home.lab.local. admin.home.lab.local. (
2025082002 ; Serial
3600 ; Refresh
3600 ; Retry
2419200 ; Expire
3600 ) ; Negative Cache TTL
;
; name servers - NS records
IN NS ns1.home.lab.local.
; PTR Records
114.0 IN PTR ns1.home.lab.local. ; 192.168.0.114
117.0 IN PTR pve.home.lab.local. ; 192.168.0.117
120.0 IN PTR pve2.home.lab.local. ; 192.168.0.120
113.0 IN PTR pve3.home.lab.local. ; 192.168.0.113
118.0 IN PTR torrent.home.lab.local. ; 192.168.0.118
118.0 IN PTR dlna.home.lab.local. ; 192.168.0.118
118.0 IN PTR microbin.home.lab.local. ; 192.168.0.118
114.0 IN PTR vpn.home.lab.local. ; 192.168.0.114
100.0 IN PTR nas.home.lab.local. ; 192.168.0.100
229.0 IN PTR libre.home.lab.local. ; 192.168.0.229
235.0 IN PTR monitoring.home.lab.local. ; 192.168.0.235

@@ -0,0 +1,29 @@
;
; BIND data file for local loopback interface
;
$TTL 604800
@ IN SOA ns1.home.lab.local. admin.home.lab.local. (
2025082002 ; Serial
3600 ; Refresh
3600 ; Retry
2419200 ; Expire
3600 ) ; Negative Cache TTL
;
; name servers - NS records
IN NS ns1.home.lab.local.
; name servers - A records
ns1.home.lab.local. IN A 192.168.0.114
; 192.168.0.0/24 - A records
pve.home.lab.local. IN A 192.168.0.117
pve2.home.lab.local. IN A 192.168.0.120
pve3.home.lab.local. IN A 192.168.0.113
torrent.home.lab.local. IN A 192.168.0.118
dlna.home.lab.local. IN A 192.168.0.118
microbin.home.lab.local. IN A 192.168.0.118
vpn.home.lab.local. IN A 192.168.0.114
nas.home.lab.local. IN A 192.168.0.100
libre.home.lab.local. IN A 192.168.0.229
monitoring.home.lab.local. IN A 192.168.0.235

@@ -0,0 +1,18 @@
providers:
  yaml_source:
    # the staged zone file is octoDNS YAML, so use YamlProvider here;
    # octodns_bind.ZoneFileSource expects BIND zone-file syntax instead
    class: octodns.provider.yaml.YamlProvider
    directory: ./zones # where your editable zone YAML lives
axfr:
class: octodns_bind.Rfc2136Provider
# The address of the nameserver to perform zone transfers against
host: localhost
# The port that the nameserver is listening on. Optional. Default: 53
zones:
home.lab.local.:
sources:
- yaml_source
targets:
- axfr
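# Hedged usage note (not part of this commit): octodns-sync plans by default
# and only applies changes with --doit. Paths assume the /srv layout staged
# by the dns role below.
#
#   octodns-sync --config-file=/srv/octodns-config/config.yaml          # dry run
#   octodns-sync --config-file=/srv/octodns-config/config.yaml --doit   # apply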

@@ -0,0 +1,19 @@
---
'':
type: A
values:
- 1.2.3.4
- 1.2.3.5
'*':
type: CNAME
value: www.home.lab.local.
www:
type: A
values:
- 1.2.3.4
- 1.2.3.5
www.sub:
type: A
values:
- 1.2.3.6
- 1.2.3.7

@@ -0,0 +1,7 @@
---
- name: restart bind9
service:
name: bind9
state: restarted
tags:
- manual

@@ -0,0 +1,48 @@
---
- name: install packages
apt:
name: "{{ item }}"
state: present
update_cache: yes
loop:
- python3.12
- python3.12-venv
- python3-pip
- python3-virtualenv
- bind9
- name: install octodns packages
pip:
name: "{{ item }}"
virtualenv: ~/.venv
loop:
- octodns
- octodns-bind
- name: create directory for output files of octodns
file:
path: "{{ item }}"
state: directory
loop:
- /srv/octodns-config/zones
- /srv/octodns-config/bind
- name: copy octodns files
copy:
src: "{{ item.src }}"
dest: "{{ item.dst }}"
loop:
- { src: octodns-config/config.yaml, dst: /srv/octodns-config }
- { src: zones/home.lab.local.yaml, dst: /srv/octodns-config/zones }
- name: copy files
copy:
src: "manual/{{ item.src }}"
dest: "/etc/bind/zones/{{ item.dest }}"
owner: root
loop:
- { src: 'db.home.lab.local', dest: 'db.home.lab.local' }
- { src: 'db.192.168', dest: 'db.192.168' }
notify: restart bind9
tags:
- manual
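# Hedged follow-up sketch (not in this commit): the tasks above install
# octoDNS and stage its config but never invoke it. Assuming the ~/.venv
# created by the pip task above:
#
#   - name: sync dns zones with octodns
#     ansible.builtin.shell: ~/.venv/bin/octodns-sync --config-file=/srv/octodns-config/config.yaml --doit
#     args:
#       chdir: /srv/octodns-config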

@@ -0,0 +1,41 @@
---
- name: Install prerequisites
ansible.builtin.package:
name:
- ca-certificates
- curl
- gnupg
- lsb-release
state: present
- name: Add Docker official GPG key (Debian/Ubuntu)
ansible.builtin.apt_key:
url: https://download.docker.com/linux/{{ ansible_distribution | lower }}/gpg
state: present
when: ansible_os_family == 'Debian'
- name: Add Docker repo (Debian/Ubuntu)
ansible.builtin.apt_repository:
repo: "deb [arch={{ ansible_architecture }}] https://download.docker.com/linux/{{ ansible_distribution | lower }} {{ ansible_distribution_release }} stable"
state: present
when: ansible_os_family == 'Debian'
- name: Install Docker Engine & Compose plugin
ansible.builtin.package:
name:
- docker-ce
- docker-ce-cli
- containerd.io
- docker-buildx-plugin
- docker-compose-plugin
state: present
- name: Install Docker Python SDK for Ansible modules
ansible.builtin.pip:
name: docker
- name: Ensure docker service running
ansible.builtin.service:
name: docker
state: started
enabled: true
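# Hedged aside: ansible.builtin.apt_key is deprecated in current ansible-core.
# A keyring-based sketch of the same repo setup (signed-by instead of apt-key):
#
#   - name: Ensure keyring directory exists
#     ansible.builtin.file:
#       path: /etc/apt/keyrings
#       state: directory
#       mode: '0755'
#   - name: Download Docker GPG key
#     ansible.builtin.get_url:
#       url: https://download.docker.com/linux/{{ ansible_distribution | lower }}/gpg
#       dest: /etc/apt/keyrings/docker.asc
#       mode: '0644'
#   - name: Add Docker repo (signed-by)
#     ansible.builtin.apt_repository:
#       repo: "deb [arch={{ 'amd64' if ansible_architecture == 'x86_64' else ansible_architecture }} signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/{{ ansible_distribution | lower }} {{ ansible_distribution_release }} stable"
#       state: present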

@@ -0,0 +1,10 @@
---
groups:
- name: blackbox.rules
rules:
- alert: HTTPDown
expr: probe_success{job="blackbox"} == 0
for: 2m
labels: {severity: critical}
annotations:
summary: "HTTP цель {{ $labels.instance }} недоступна"

@@ -0,0 +1,8 @@
---
groups:
- name: recording.rules
rules:
      - record: instance:node_cpu_util:avg5m
        expr: avg by (instance) (rate(node_cpu_seconds_total{mode!="idle"}[5m]))
      - record: instance:disk_used:ratio
        expr: 1 - (node_filesystem_avail_bytes / node_filesystem_size_bytes)

@@ -0,0 +1,24 @@
---
groups:
- name: system.rules
rules:
- alert: HostDown
expr: up == 0
for: 2m
labels: {severity: critical}
annotations:
summary: "Хост {{ $labels.instance }} недоступен"
- alert: HighCPULoad
expr: avg by (instance)(rate(node_cpu_seconds_total{mode!="idle"}[5m])) > 0.90
for: 10m
labels: {severity: warning}
annotations:
summary: "Высокая загрузка CPU на {{ $labels.instance }}"
- alert: LowDiskSpace
expr: (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}) < 0.10
for: 5m
labels: {severity: critical}
annotations:
summary: "Мало места на {{ $labels.instance }} {{ $labels.mountpoint }}"

@@ -0,0 +1,89 @@
---
- name: Create directories
ansible.builtin.file:
path: "{{ item }}"
state: directory
mode: '0755'
loop:
- "{{ monitoring_base_dir }}"
- "{{ monitoring_base_dir }}/prometheus"
- "{{ monitoring_base_dir }}/alertmanager"
- "{{ monitoring_base_dir }}/loki"
- "{{ monitoring_base_dir }}/grafana/provisioning/datasources"
- "{{ monitoring_base_dir }}/rules"
- name: Copy example alert/recording rules
ansible.builtin.copy:
src: "roles/monitoring_stack/files/rules/"
dest: "{{ monitoring_base_dir }}/rules/"
mode: '0644'
- name: Render Prometheus config
ansible.builtin.template:
src: templates/prometheus.yml.j2
dest: "{{ monitoring_base_dir }}/prometheus/prometheus.yml"
mode: '0644'
notify: Restart monitoring stack
- name: Render Alertmanager config
ansible.builtin.template:
src: templates/alertmanager.yml.j2
dest: "{{ monitoring_base_dir }}/alertmanager/alertmanager.yml"
mode: '0640'
notify: Restart monitoring stack
- name: Render Loki config
ansible.builtin.template:
src: templates/loki-config.yml.j2
dest: "{{ monitoring_base_dir }}/loki/config.yml"
mode: '0644'
notify: Restart monitoring stack
- name: Render Grafana provisioning (datasources)
ansible.builtin.template:
src: templates/grafana-provisioning-datasources.yml.j2
dest: "{{ monitoring_base_dir }}/grafana/provisioning/datasources/datasources.yml"
mode: '0644'
notify: Restart monitoring stack
- name: Render docker-compose.yml
ansible.builtin.template:
src: templates/docker-compose.yml.j2
dest: "{{ monitoring_base_dir }}/docker-compose.yml"
mode: '0644'
notify: Restart monitoring stack
- name: Ensure Docker network exists
community.docker.docker_network:
name: "{{ monitoring_docker_network }}"
state: present
- name: Launch/Update monitoring stack via Compose v2
community.docker.docker_compose_v2:
project_src: "{{ monitoring_base_dir }}"
state: present
- name: Wait for Grafana to be reachable
ansible.builtin.uri:
url: http://localhost:3000/api/health
status_code: 200
validate_certs: false
timeout: 30
retries: 10
delay: 3
method: GET
return_content: false
register: gf_health
until: gf_health is succeeded
- name: Show Grafana URL
ansible.builtin.debug:
msg: "Grafana is up: http://{{ ansible_host | default(inventory_hostname) }}:3000"
# Handlers — note: notify only finds handlers defined in handlers/main.yml,
# so this block belongs in roles/monitoring_stack/handlers/main.yml.
- name: Restart monitoring stack
listen: Restart monitoring stack
community.docker.docker_compose_v2:
project_src: "{{ monitoring_base_dir }}"
state: present
pull: always

@@ -0,0 +1,23 @@
route:
receiver: 'telegram'
group_by: ['alertname','instance']
group_wait: 30s
group_interval: 5m
repeat_interval: 12h
receivers:
- name: 'telegram'
telegram_configs:
      - bot_token: "{{ telegram_bot_token }}" # Alertmanager does not expand env vars; render via Jinja
        chat_id: {{ telegram_chat_id }} # must render as an integer
message: |
[{{ '{{' }} .Status | toUpper {{ '}}' }}] {{ '{{' }} .CommonLabels.alertname {{ '}}' }}
{{ '{{' }}- range .Alerts {{ '}}' }}
• {{ '{{' }} .Annotations.summary {{ '}}' }}
Labels: {{ '{{' }} .Labels {{ '}}' }}
{{ '{{' }}- end {{ '}}' }}
inhibit_rules:
- source_matchers: [ severity="critical" ]
target_matchers: [ severity="warning" ]
equal: [ 'alertname','instance' ]
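# Hedged verification note: amtool ships in the Alertmanager image, so the
# rendered file can be checked before a restart (paths from the role defaults):
#
#   docker run --rm --entrypoint /bin/amtool \
#     -v /opt/monitoring/alertmanager:/cfg:ro \
#     prom/alertmanager:v0.27.0 check-config /cfg/alertmanager.yml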

@@ -0,0 +1,61 @@
version: "3.9"
services:
prometheus:
image: {{ images.prometheus }}
command:
- --config.file=/etc/prometheus/prometheus.yml
- --storage.tsdb.retention.time={{ prometheus_retention }}
    volumes:
      - {{ monitoring_base_dir }}/prometheus:/etc/prometheus:ro
      - {{ monitoring_base_dir }}/rules:/etc/prometheus/rules:ro # rule files rendered by the role
      - prom_data:/prometheus
ports:
- "9090:9090"
networks: [ {{ monitoring_docker_network }} ]
restart: unless-stopped
alertmanager:
image: {{ images.alertmanager }}
command: ["--config.file=/etc/alertmanager/alertmanager.yml"]
volumes:
- {{ monitoring_base_dir }}/alertmanager:/etc/alertmanager:ro
ports:
- "9093:9093"
networks: [ {{ monitoring_docker_network }} ]
restart: unless-stopped
    environment: # note: Alertmanager does not expand env vars; secrets are rendered into alertmanager.yml by Ansible
TELEGRAM_BOT_TOKEN: "{{ telegram_bot_token }}"
TELEGRAM_CHAT_ID: "{{ telegram_chat_id }}"
loki:
image: {{ images.loki }}
command: ["-config.file=/etc/loki/config.yml"]
volumes:
- {{ monitoring_base_dir }}/loki:/etc/loki:ro
- loki_data:/loki
ports:
- "3100:3100"
networks: [ {{ monitoring_docker_network }} ]
restart: unless-stopped
grafana:
image: {{ images.grafana }}
volumes:
- grafana_data:/var/lib/grafana
- {{ monitoring_base_dir }}/grafana/provisioning:/etc/grafana/provisioning:ro
ports:
- "3000:3000"
networks: [ {{ monitoring_docker_network }} ]
restart: unless-stopped
environment:
GF_SECURITY_ADMIN_USER: admin
      GF_SECURITY_ADMIN_PASSWORD: admin # change this, or source it from vault
GF_SERVER_ROOT_URL: http://{{ ansible_host | default(inventory_hostname) }}:3000
volumes:
prom_data: {}
loki_data: {}
grafana_data: {}
networks:
{{ monitoring_docker_network }}:
external: true

@@ -0,0 +1,14 @@
apiVersion: 1
datasources:
- name: Prometheus
type: prometheus
access: proxy
url: http://prometheus:9090
isDefault: true
editable: true
- name: Loki
type: loki
access: proxy
url: http://loki:3100
editable: true

@@ -0,0 +1,40 @@
server:
http_listen_port: 3100
grpc_listen_port: 9096
common:
path_prefix: /loki
storage:
filesystem:
chunks_directory: /loki/chunks
rules_directory: /loki/rules
replication_factor: 1
ring:
instance_addr: 127.0.0.1
kvstore:
store: inmemory
schema_config:
configs:
- from: 2024-01-01
store: boltdb-shipper
object_store: filesystem
schema: v13
index:
prefix: index_
period: 24h
limits_config:
ingestion_rate_mb: 16
max_global_streams_per_user: 20000
retention_period: 0s
retention_stream:
- selector: '{job="syslog"}'
priority: 1
period: 720h # 30d
- selector: '{job="pve"}'
priority: 1
period: 2160h # 90d
chunk_store_config:
max_look_back_period: 720h
compactor:
working_directory: /loki/compactor
shared_store: filesystem
retention_enabled: true
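# Hedged aside (not part of this commit): the role defaults pin a promtail
# image and a loki_http_url "for agents", but no promtail config ships here.
# A minimal sketch for an agent host, pushing to Loki with the job=syslog
# label that the retention_stream above keys on:
#
#   server:
#     http_listen_port: 9080
#     grpc_listen_port: 0
#   positions:
#     filename: /tmp/positions.yaml
#   clients:
#     - url: http://monitor1:3100/loki/api/v1/push
#   scrape_configs:
#     - job_name: syslog
#       static_configs:
#         - targets: [localhost]
#           labels:
#             job: syslog
#             __path__: /var/log/syslog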

@@ -0,0 +1,31 @@
global:
scrape_interval: 15s
evaluation_interval: 30s
rule_files:
  - /etc/prometheus/rules/*.yml # mounted from the rules dir in docker-compose
alerting:
alertmanagers:
- static_configs:
- targets: ["alertmanager:9093"]
scrape_configs:
- job_name: 'node_exporter'
static_configs:
- targets: {{ prometheus_targets.node_exporter | to_nice_json }}
# Uncomment if you add blackbox to this same compose file
# - job_name: 'blackbox'
# metrics_path: /probe
# params:
# module: [http_2xx]
# static_configs:
# - targets: {{ prometheus_targets.blackbox_http | default([]) | to_nice_json }}
# relabel_configs:
# - source_labels: [__address__]
# target_label: __param_target
# - source_labels: [__param_target]
# target_label: instance
# - target_label: __address__
# replacement: blackbox:9115
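# Hedged verification note: promtool is bundled in the Prometheus image and
# validates both this config and the rule files it references:
#
#   docker run --rm --entrypoint /bin/promtool \
#     -v /opt/monitoring/prometheus:/etc/prometheus:ro \
#     -v /opt/monitoring/rules:/etc/prometheus/rules:ro \
#     prom/prometheus:v2.53.0 check config /etc/prometheus/prometheus.yml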

@@ -0,0 +1,5 @@
---
- name: create a user {{ nas_user_login }}
  ansible.builtin.user:
    name: "{{ nas_user_login }}"
    password: "{{ nas_user_pass | password_hash('sha512') }}" # the user module expects a crypted hash, not plaintext
    create_home: yes

@@ -23,4 +23,4 @@ $TTL 604800
114.0 IN PTR vpn.home.lab.local. ; 192.168.0.114
100.0 IN PTR nas.home.lab.local. ; 192.168.0.100
229.0 IN PTR libre.home.lab.local. ; 192.168.0.229
-229.0 IN PTR monitoring.home.lab.local. ; 192.168.0.235
+235.0 IN PTR monitoring.home.lab.local. ; 192.168.0.235

@@ -26,4 +26,4 @@ microbin.home.lab.local. IN A 192.168.0.118
vpn.home.lab.local. IN A 192.168.0.114
nas.home.lab.local. IN A 192.168.0.100
libre.home.lab.local. IN A 192.168.0.229
-monitoring.home.lab.local. IN A 192.168.0.235
+monitoring.home.lab.local. IN A 192.168.0.235