From ddc91059b5f1a03503571da8b5a224b710517392 Mon Sep 17 00:00:00 2001 From: Jake Howard Date: Tue, 31 Jan 2023 09:11:09 +0000 Subject: [PATCH] Replace telegraf with prometheus exporters SMART and ping monitoring are still missing, but those can come later. Swapped because Prometheus's polling model doesn't play especially well with telegraf, which leads to gaps in data --- ansible/galaxy-requirements.yml | 5 +- ansible/group_vars/all/node_exporter.yml | 3 + ansible/main.yml | 3 +- .../files/prometheus/docker-compose.yml | 10 ++ .../forrest/files/prometheus/prometheus.yml | 26 +++- ansible/roles/forrest/vars/main.yml | 1 + ansible/roles/forrest/vars/vault.yml | 99 +++++++------- ansible/roles/telegraf/files/telegraf.conf | 122 ------------------ ansible/roles/telegraf/tasks/main.yml | 10 -- ansible/roles/telegraf/vars/main.yml | 11 -- 10 files changed, 89 insertions(+), 201 deletions(-) create mode 100644 ansible/group_vars/all/node_exporter.yml delete mode 100644 ansible/roles/telegraf/files/telegraf.conf delete mode 100644 ansible/roles/telegraf/tasks/main.yml delete mode 100644 ansible/roles/telegraf/vars/main.yml diff --git a/ansible/galaxy-requirements.yml b/ansible/galaxy-requirements.yml index d61c3c7..e3d2b36 100644 --- a/ansible/galaxy-requirements.yml +++ b/ansible/galaxy-requirements.yml @@ -2,7 +2,8 @@ collections: - ansible.posix - community.general - community.docker - - amazon.aws # Dependency of rossmcdonald.telegraf + - name: https://github.com/prometheus-community/ansible + type: git roles: - src: geerlingguy.docker @@ -13,8 +14,6 @@ roles: - src: ironicbadger.proxmox_nag_removal version: 1.0.1 - src: chmduquesne.iptables_persistent - - src: rossmcdonald.telegraf - version: v1.2.0 - src: geerlingguy.gitlab version: 3.2.0 - src: dokku_bot.ansible_dokku diff --git a/ansible/group_vars/all/node_exporter.yml b/ansible/group_vars/all/node_exporter.yml new file mode 100644 index 0000000..bc59928 --- /dev/null +++ b/ansible/group_vars/all/node_exporter.yml @@ -0,0 +1,3 
@@ +node_exporter_version: 1.5.0 +node_exporter_web_listen_address: "{{ private_ip }}:9100" +node_exporter_enabled_collectors: [] # Disable the systemd collector by default diff --git a/ansible/main.yml b/ansible/main.yml index c403704..e20f7be 100644 --- a/ansible/main.yml +++ b/ansible/main.yml @@ -85,9 +85,10 @@ become: true - zfs - pve_nebula_route - - telegraf - role: ironicbadger.snapraid become: true + - role: prometheus.prometheus.node_exporter + become: true - hosts: forrest roles: diff --git a/ansible/roles/forrest/files/prometheus/docker-compose.yml b/ansible/roles/forrest/files/prometheus/docker-compose.yml index eaf1392..0cc8a7c 100644 --- a/ansible/roles/forrest/files/prometheus/docker-compose.yml +++ b/ansible/roles/forrest/files/prometheus/docker-compose.yml @@ -38,6 +38,16 @@ services: volumes: - ./alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro + proxmox_exporter: + image: prompve/prometheus-pve-exporter:latest + restart: unless-stopped + user: "{{ docker_user.id }}" + environment: + - PVE_USER=prometheus@pve + - PVE_TOKEN_NAME=prometheus + - PVE_TOKEN_VALUE={{ prometheus_api_token }} + - PVE_VERIFY_SSL=false + networks: grafana: external: true diff --git a/ansible/roles/forrest/files/prometheus/prometheus.yml b/ansible/roles/forrest/files/prometheus/prometheus.yml index 26bcf87..f85d055 100644 --- a/ansible/roles/forrest/files/prometheus/prometheus.yml +++ b/ansible/roles/forrest/files/prometheus/prometheus.yml @@ -5,14 +5,18 @@ alerting: - alertmanager:9093 scrape_configs: - - job_name: telegraf - metric_relabel_configs: - - source_labels: [__name__] - regex: go_.+ - action: drop + - job_name: pve static_configs: - targets: - - "{{ pve_hosts.pve.ip }}:9273" + - "{{ pve_hosts.pve.ip }}" + metrics_path: /pve + relabel_configs: + - source_labels: [__address__] + target_label: __param_target + - source_labels: [__param_target] + target_label: instance + - target_label: __address__ + replacement: proxmox_exporter:9221 - job_name: traefik 
metric_relabel_configs: @@ -102,3 +106,13 @@ scrape_configs: metrics_path: /projects/{{ healthchecks_project_uuid }}/metrics/{{ healthcheck_api_token }} static_configs: - targets: [healthchecks.io] + + - job_name: node + metrics_path: /metrics + static_configs: + - targets: + - "{{ pve_hosts.pve.ip }}:9100" + metric_relabel_configs: + - source_labels: [__name__] + regex: go_.+ + action: drop diff --git a/ansible/roles/forrest/vars/main.yml b/ansible/roles/forrest/vars/main.yml index b5c1f28..f4edac7 100644 --- a/ansible/roles/forrest/vars/main.yml +++ b/ansible/roles/forrest/vars/main.yml @@ -9,3 +9,4 @@ healthcheck_api_token: "{{ vault_healthcheck_api_token }}" alertmanager_from_address: "{{ vault_alertmanager_from_address }}" alertmanager_smtp_password: "{{ vault_alertmanager_smtp_password }}" alertmanager_to_address: "{{ vault_alertmanager_to_address }}" +prometheus_api_token: "{{ vault_prometheus_api_token }}" diff --git a/ansible/roles/forrest/vars/vault.yml b/ansible/roles/forrest/vars/vault.yml index 9c8b0d5..984ac21 100644 --- a/ansible/roles/forrest/vars/vault.yml +++ b/ansible/roles/forrest/vars/vault.yml @@ -1,49 +1,52 @@ $ANSIBLE_VAULT;1.1;AES256 -61666339653630373931366533656437386337663739313532663339356562656466373461303837 -3132343434633835333035303863346532363137656565380a646239326534346135333737653337 -31626663636661396462353765653830396132306531393965306439316437623866306166656430 -3664373263643666350a353139653633663233623064353532313136393865333763353233343132 -61363864666534636630623639346361656361633235313535323030633434656136626131656237 -62633433613537363431336664313166306539616431626131653337363536633536386561373630 -32323165393538636437653361363766636139646166646538666462333337333837383966643263 -31646562316433323962353439353636393965393962633665306232653737353233346632326233 -36333863623933313238663434643737363638666662323238643238326165396433653462663934 -62663538366633386533316432323535656563346435353665333430623434616634393565633962 
-65383262356333643437636531663034303933313534393965633739633031393139386633376134 -35653164313830643065613439663462306131383135366363653834386230363639366233643866 -30626638306666653065353237663434633333376337343133656237666662343163656164353935 -39386262303664653164356431643466306463343564633064376532663133393432626333393236 -39306463663963383234363664643265393434363138373062643165313332643833663464343766 -33393162336535303136653230633135636266653863646331346630386131303131333636613938 -66386331643966313236356634623938363835323439343234613164616132616137353237356162 -66333964363361353436396634353735663963643239356164333434373166623731643038656136 -32376339316434373836333532373664323762636634336361616437306132356637616162303231 -30356263366137363337316631643536643539653237636362386366656434353861343239306431 -33326163336536636233643434633461626430313536376632623334303938316364316162386365 -38393138356538623235663539393166306366396364306335656534366663616336666637323138 -32343035373063306462636163616364616332333263663737653235613230313465363466616565 -36373831383364333837303433663839353363663730333665663138386635626530323633626661 -62633066633361366562383737646565613566653866663136326135383533303962326637316132 -39313832303538363432646432616433303966626133653461363530623966363133646530353931 -62646334646336633162633763613333663937636262633138323932313632653630393162363439 -37343739313066343831623265353164386239373430623735343739373737373130643631343664 -31626633303764633831646565653732326363336431353433353365633433626665353733663336 -32643132666565373332323136386630633561656632613335613965343439633831336336306464 -62303234646233633366383430366431336430356463643630623566373333646532333663623466 -63326364393166393461653366373634626333623836303466636136663331383239336432363661 -62383836653562633765333034306161326539396330663831363135326235343535393663363638 -36616130663538376534616161393930633462316539643561323665313339646234306232656662 
-30356139303465313561636433646162346136303632653230353437383063396438616536376533 -30363362613635336162336537333932306666666632336664613631623830616538633131663365 -36656164623139643333333237376333656632306436313865656337616439343434383231616561 -32323232306437353034643735356532653233306131356632393963616337303835663632653837 -63656433633061313034656565373061643132623163626230386561323466353534323734326437 -34336439363563373637336139376261376439306166656337663366383463313938333731306664 -34393062653536346464373637343232666465336564613235323931373465323864386362336561 -37633837336136373030346533633562343439316264333462623763666162373938326437353638 -63343062313732356338623731626232626265366466623437383734313630393030366330353864 -63613766373438366564663761623766353636303066343665353437643830626364663031336537 -36663739303232363162646464663339623939363733393835383366636463643633396362363137 -62613138656563666661363630633935346433356235303936356431343637663938646161343266 -61313861346562396439383262396435313064643264353536326238623461653965626334653866 -36646239313666396338 +36623535313964653161353330663436356239613837653837393939373034353031646535333535 +6439313832316239616233306632373934616134616466380a316361363263373938636161666535 +31613461333637373732626233623434316335353964353433643635653566613933393361336139 +3864373963396264320a376634346331373762313733323961386566646338633936303631303566 +66616534326430653266396635353932623661363533356537636662636537656434363562646230 +30613831336561376639393466373739373138313931333163353061633465623362666564313631 +66623235353531613737643937613430323934376433393836346339626137616561313062663234 +63363736326439623661376132613136383465393761653236663631613339653066356436653630 +66623865303735616335373231643233386639323838353534613337316161633765396234366533 +33616631663530643764373937346262633734366339303837393737666665363465333239343933 +35613962396534336232623833303034643639323931633966396439383463396261313862626335 
+31323434613838353961336136613966636635646632393839663664376632373834313265643338 +30663132633362323831313231333164643665386535323231646262656631383631393539616639 +34343563353064303833383236626136666264316236316537333965313162616637323966363335 +32353936663162316564306337353861396634353935353935306135343665316262643831396537 +61393266383538666563363261646534636632303332343662636631316663343930303766623638 +35376565343638316339623061396536643636313966383633346231633631353032356661386132 +66623439336338616666626431303635373833666137326234653161336434346133636261363662 +39313732303736386137656664303365363234336265643064306562643435633838373864353862 +33366635333630373162656630666232333563623066333461653963363961623435646631373561 +64643738346138366566303233326663383835386132663034313461383161616164636332396332 +37663131386135393833373461663432666264363065666630646164633134303439663435616235 +35656234313761376532306264393637653433623863383830323935316332383338623134323366 +31336665386137323132363962363335623635336131373930353635353663333366363266303138 +35626262613261636561373730626635303836623561643436646430653365663432323938393863 +63633331663462323163646237386262376337313330323036613434383165616530643362616131 +63616562353964316634646434653138333266646633616631653663663838306163616633643234 +61333230373237613436343662363434303766383336376232353066313231666330613761643366 +36326638326439653966643430313366376661633636366565393461623438323366373333663633 +61633763623631333665363333646433656166633364303836623566333336343761613435353138 +37366165613263653564386334303030623333646164303662363065333831376334656537613130 +33373864663237383064653461616165653834393063663332643235316139333539623463343161 +38636564626466633631393938653066373764663935353763626133623762306164383831663061 +34333065326666373337663931313763383739383763333235333939376133363236643136346233 +62643833376631643036613963643939333133343036613332313866373032646332363231313139 
+61373365653665343066636162356336373833393363373866343436323639623435383831363335 +30333033326638363930613030356664333233633339666366643062353634333161343838666231 +32346332663538653937623136653438636463323463376263303962353562313833373937303066 +65303037323030653434313164393766633134306435633263363335636561356264376665363639 +35613731373437386566663266656266343639326334303239613862353963323436633836383766 +35323930633039396535616265643234303639393035363865643236623838333337626135343665 +36373038666332376663333565623362303631663830336131343438353764653831633433363436 +36333839303433623966363561313564303037393165383732323763353232653564346138666438 +30653836626139356133346538616135313034633966373036303461393562363336386633626365 +33393565643730383634346238356462313435366538636234656237613864656165656439363061 +32626235323362333239373631383830653035383164646364343461376562636564343063353139 +61306535333466653937303635353962376162376431336563316130343530636431623537633332 +65373333376338353930316561636530343062653964323463653632653332376432343237656465 +63333437613064313438353134333566303033313339323162643061363836643931343135396130 +32623435653533326563616263323938343332306362383034663139653965626231336637383939 +313534343431303739396263303737303365 diff --git a/ansible/roles/telegraf/files/telegraf.conf b/ansible/roles/telegraf/files/telegraf.conf deleted file mode 100644 index bb555d3..0000000 --- a/ansible/roles/telegraf/files/telegraf.conf +++ /dev/null @@ -1,122 +0,0 @@ -# Global tags can be specified here in key="value" format. -[global_tags] - # dc = "us-east-1" # will tag all metrics with dc=us-east-1 - # rack = "1a" - ## Environment variables can be used as tags, and throughout the config file - # user = "$USER" - - -# Configuration for telegraf agent -[agent] - ## Default data collection interval for all inputs - interval = "60s" - ## Rounds collection interval to 'interval' - ## ie, if interval="10s" then always collect on :00, :10, :20, etc. 
- round_interval = true - - ## Telegraf will send metrics to outputs in batches of at most - ## metric_batch_size metrics. - ## This controls the size of writes that Telegraf sends to output plugins. - metric_batch_size = 1000 - - ## Maximum number of unwritten metrics per output. Increasing this value - ## allows for longer periods of output downtime without dropping metrics at the - ## cost of higher maximum memory usage. - metric_buffer_limit = 10000 - - ## Collection jitter is used to jitter the collection by a random amount. - ## Each plugin will sleep for a random time within jitter before collecting. - ## This can be used to avoid many plugins querying things like sysfs at the - ## same time, which can have a measurable effect on the system. - collection_jitter = "0s" - - ## Default flushing interval for all outputs. Maximum flush_interval will be - ## flush_interval + flush_jitter - flush_interval = "10s" - ## Jitter the flush interval by a random amount. This is primarily to avoid - ## large write spikes for users running a large number of telegraf instances. - ## ie, a jitter of 5s and interval 10s means flushes will happen every 10-15s - flush_jitter = "0s" - - ## By default or when set to "0s", precision will be set to the same - ## timestamp order as the collection interval, with the maximum being 1s. - ## ie, when interval = "10s", precision will be "1s" - ## when interval = "250ms", precision will be "1ms" - ## Precision will NOT be used for service inputs. It is up to each individual - ## service input to set the timestamp at the appropriate precision. - ## Valid time units are "ns", "us" (or "µs"), "ms", "s". - precision = "" - - ## Log at debug level. - # debug = false - ## Log only error level messages. - # quiet = false - - ## Log target controls the destination for logs and can be one of "file", - ## "stderr" or, on Windows, "eventlog". When set to "file", the output file - ## is determined by the "logfile" setting. 
- # logtarget = "file" - - ## Name of the file to be logged to when using the "file" logtarget. If set to - ## the empty string then logs are written to stderr. - # logfile = "" - - ## The logfile will be rotated after the time interval specified. When set - ## to 0 no time based rotation is performed. Logs are rotated only when - ## written to, if there is no log activity rotation may be delayed. - # logfile_rotation_interval = "0d" - - ## The logfile will be rotated when it becomes larger than the specified - ## size. When set to 0 no size based rotation is performed. - # logfile_rotation_max_size = "0MB" - - ## Maximum number of rotated archives to keep, any older logs are deleted. - ## If set to -1, no archives are removed. - # logfile_rotation_max_archives = 5 - - ## Override default hostname, if empty use os.Hostname() - hostname = "{{ ansible_hostname }}" - ## If set to true, do no set the "host" tag in the telegraf agent. - omit_hostname = false - - -############################################################################### -# OUTPUT PLUGINS # -############################################################################### - -[[outputs.prometheus_client]] - listen = "{{ private_ip }}:9273" - metric_version = 2 - -############################################################################### -# INPUT PLUGINS # -############################################################################### - -[[inputs.ping]] - urls = ["192.168.1.1", "9.9.9.9"] - ping_interval = 60.0 - -[[inputs.smart]] - use_sudo = true - -[[inputs.proxmox]] - base_url = "https://{{ pve_hosts.pve.ip }}:8006/api2/json/" - api_token = "telegraf@pve!telegraf={{ proxmox_telegraf_token }}" - node_name = "pve" - insecure_skip_verify = true - -[[inputs.disk]] - ignore_fs = ["tmpfs", "devtmpfs", "devfs", "iso9660", "overlay", "aufs", "squashfs"] - -[[inputs.system]] - -[[inputs.sensors]] - -[[inputs.nvidia_smi]] - -[[inputs.exec]] - command = "speedtest --json" - name_override = "speedtest" - 
timeout = "2m" - interval = "10m" - data_format = "json" diff --git a/ansible/roles/telegraf/tasks/main.yml b/ansible/roles/telegraf/tasks/main.yml deleted file mode 100644 index afc56ac..0000000 --- a/ansible/roles/telegraf/tasks/main.yml +++ /dev/null @@ -1,10 +0,0 @@ -- name: Install and configure telegraf - import_role: - name: rossmcdonald.telegraf - become: true - -- name: Let telegraf do smart stats - lineinfile: - path: /etc/sudoers - line: "{{ telegraf_runas_user }} ALL=(ALL) NOPASSWD: /usr/sbin/smartctl" - become: true diff --git a/ansible/roles/telegraf/vars/main.yml b/ansible/roles/telegraf/vars/main.yml deleted file mode 100644 index bd079b4..0000000 --- a/ansible/roles/telegraf/vars/main.yml +++ /dev/null @@ -1,11 +0,0 @@ -telegraf_configuration_template: files/telegraf.conf -telegraf_aws_tags: false - -proxmox_telegraf_token: !vault | - $ANSIBLE_VAULT;1.1;AES256 - 34646261333165343031323566643738353363393864363035303037653838383038363162346164 - 6239313461393366373534636539613639623061393231640a613837343731373462666662356563 - 34343934313165623433646335383064333136343237353030353532653631633635366135336134 - 3931336436656561350a376137636666633937636134663139326630623761386435653435653338 - 36643232396361313436666533373737626365633662383239623561373061313636366231636330 - 3539303065366366323137336432613165336462363330363735