by Nicolas Jeanselme
index=ib_dns_summary report=si_dns_member_qps_trend | stats count by orig_host orig_host orig_host Trinzic 810 Trinzic 815 Trinzic 820 Trinzic 825 Trinzic 1410 Trinzic 1415 Trinzic 1420/PT-1400 Trinzic 1425/PT-1405 Trinzic 2210 Trinzic 2215 Trinzic 2220/PT-2200 Trinzic 2225/PT-2205 Trinzic 4010/PT-4000 Trinzic 4030 4000 60 15000 105 30000 210 50000 300 61000 375 143000 600 6000 90 22500 150 45000 300 75000 450 90000 550 200000 900 200000 1020 1000000 0 -24h now
Performance Query per second (QPS) sourcetype=ib:dns:query:by_member index=ib_dns host="$grid_member_var$"| bucket span=10m _time | stats sum(QCOUNT) as QPM by _time | timechart bins=1000 eval(avg(QPM)/600) as QPS| interpolate 1200 $time.earliest$ $time.latest$ Cache hit ratio (CHR) % sourcetype=ib:dns:query:cache_hit_rate index=ib_dns host="$grid_member_var$" | eval PERCENT=if(HITS+MISSES > 0,(HITS*100/(HITS+MISSES)),0) | bucket span=10m _time | timechart bins=1000 avg(PERCENT) as CHR by display_name | interpolate 1200 $time.earliest$ $time.latest$ Latency in ms sourcetype=ib:dns:perf index=ib_dns host="$grid_member_var$"| bucket span=1m _time | timechart bins=1000 avg(LATENCY) as Latency_ms| interpolate 120 $time.earliest$ $time.latest$ Max QPS at 100% Cache Hit Ratio (CHR) index=* |head 1| eval max=$qps_threshold$|table max 0 QPS, CHR and Max QPS for this CHR comparison sourcetype=ib:dns:query:by_member index=ib_dns host="$grid_member_var$"| bucket span=10m _time | stats sum(QCOUNT) as QPM by _time | timechart bins=1000 eval(avg(QPM)/600) as QPS| interpolate 1200 | join _time [search sourcetype=ib:dns:query:cache_hit_rate index=ib_dns host="$grid_member_var$" | eval PERCENT=if(HITS+MISSES > 0,(HITS*100/(HITS+MISSES)),0) | bucket span=10m _time | timechart bins=1000 avg(PERCENT) as CHR | interpolate 1200] | eval Max_QPS=$qps_threshold$ | eval Max_QPS_for_this_CHR=100/((100-CHR)/($qps_threshold$/10)+CHR/$qps_threshold$) $time.earliest$ $time.latest$ DNS engine usage % (Max QPS for CHR vs QPS), CPU & CHR sourcetype=ib:dns:query:by_member index=ib_dns host="$grid_member_var$"| bucket span=10m _time | stats sum(QCOUNT) as QPM by _time | timechart bins=1000 eval(avg(QPM)/600) as QPS| interpolate 1200 | join _time [search sourcetype=ib:dns:query:cache_hit_rate index=ib_dns host="$grid_member_var$" | eval PERCENT=if(HITS+MISSES > 0,(HITS*100/(HITS+MISSES)),0) | bucket span=10m _time | timechart bins=1000 avg(PERCENT) as CHR | interpolate 1200| join _time [search 
sourcetype=ib:system index=ib_system " 1 " host="$grid_member_var$" | bucket span=10m _time | timechart bins=1000 avg(CPU_PERCENT) as CPU| interpolate 1200]] | eval Max_QPS=$qps_threshold$ | eval Max_QPS_for_this_CHR=100/((100-CHR)/($qps_threshold$/10)+CHR/$qps_threshold$) | eval DNS_engine_usage=(QPS*100/Max_QPS_for_this_CHR) | fields _time, DNS_engine_usage, CHR, CPU |forecast DNS_engine_usage future_timespan=90 as DNS_engine_prediction $time.earliest$ $time.latest$ Memory usage sourcetype=ib:system index=ib_system " 2 " host="$grid_member_var$" | timechart bins=1000 avg(MEMORY_PERCENT) as Mem $time.earliest$ $time.latest$ DNS Engine maximum load % sourcetype=ib:dns:query:by_member index=ib_dns host="$grid_member_var$"| bucket span=10m _time | stats sum(QCOUNT) as QPM by _time | timechart bins=1000 eval(avg(QPM)/600) as QPS| interpolate 1200 | join _time [search sourcetype=ib:dns:query:cache_hit_rate index=ib_dns (HITS=0 OR MISSES=0) host="$grid_member_var$" | eval PERCENT=if(HITS+MISSES > 0,(HITS*100/(HITS+MISSES)),0) | bucket span=10m _time | timechart bins=1000 avg(PERCENT) as CHR | interpolate 1200| join _time [search sourcetype=ib:system index=ib_system " 1 " host="$grid_member_var$" | bucket span=10m _time | timechart bins=1000 avg(CPU_PERCENT) as CPU| interpolate 1200]] | eval Max_QPS=$qps_threshold$ | eval Max_QPS_for_this_CHR=100/((100-CHR)/($qps_threshold$/10)+CHR/$qps_threshold$) | eval DNS_engine_usage=(QPS*100/Max_QPS_for_this_CHR) | stats max(DNS_engine_usage) $time.earliest$ $time.latest$ DNS Indicators - Top Top 10 DNS Clients index=ib_dns_summary report=si_dns_top_clients orig_host="$grid_member_var$" | stats sum(COUNT) as CLIENT_QUERIES by CLIENT | sort -CLIENT_QUERIES | eventstats sum(CLIENT_QUERIES) as TOTAL | eval percent=round(CLIENT_QUERIES*100/TOTAL,6) | rename CLIENT as Client, CLIENT_QUERIES as count | fields Client, count, percent | head 10 $time.earliest$ $time.latest$
Top 10 Requested FQDN index=ib_dns_summary report=si_dns_requested_domain orig_host="$grid_member_var$" | stats sum(COUNT) as FQDN_TOTAL by FQDN | sort -FQDN_TOTAL | eventstats sum(FQDN_TOTAL) as TOTAL | eval percent=round(FQDN_TOTAL*100/TOTAL, 6) | rename FQDN_TOTAL as count, FQDN as "Domain Name" | fields "Domain Name", count, percent| head 10 $time.earliest$ $time.latest$
Top 10 Requested FQDN not in Alexa 2000 index=ib_dns_summary report=si_dns_requested_domain orig_host="$grid_member_var$" | lookup alexa2000global.csv fqdn as FQDN OUTPUTNEW fqdn AS match | where isnull(match)| stats sum(COUNT) as FQDN_TOTAL by FQDN | sort -FQDN_TOTAL | rename FQDN_TOTAL as count, FQDN as "Domain Name" | head 10 $time.earliest$ $time.latest$
DNS Engine Indicators Recursion client quota index=ib_syslog "Recursion client quota" host="$grid_member_var$"| timechart avg(used) as used ,avg(max) as max ,avg(soft_limit) as soft_limit ,avg(s_over) as s_over ,avg(hard_limit) as hard_limit ,avg(h_over) as h_over ,avg(low_pri) as low_pri $time.earliest$ $time.latest$ Clients per Query index=ib_syslog "clients per query" host="$grid_member_var$"| timechart avg(limit) as limit ,avg(max) as max , avg(avg) as avg, avg(soft_limit) as soft_limit ,avg(limit_over) as limit_over ,avg(hard_limit) as hard_limit ,avg(h_over) as h_over ,avg(est_max_req) as "est_max_req" $time.earliest$ $time.latest$ DNS recursive cache size index=ib_syslog "Recursion cache view" host="$grid_member_var$" | timechart avg(size) as size $time.earliest$ $time.latest$ Recursion quota reached index=ib_syslog "quota reached" host="$grid_member_var$"| timechart count $time.earliest$ $time.latest$ Top 10 client IP reached quota index=ib_syslog "quota reached" host="$grid_member_var$"| top client_ip limit=10 $time.earliest$ $time.latest$
Top 10 FQDNs with quota reached index=ib_syslog "quota reached" host="$grid_member_var$"| top fqdn limit=10 $time.earliest$ $time.latest$
DNS engine messages by severity over time index=ib_syslog named process=named host="$grid_member_var$"| timechart count by severity $time.earliest$ $time.latest$ DNS engine messages by severity index=ib_syslog named process=named host="$grid_member_var$"| top limit=8 severity $time.earliest$ $time.latest$ Top 5 DNS engine messages by severity index=ib_syslog named process=named host="$grid_member_var$"| top limit=5 message by severity $time.earliest$ $time.latest$
RPZ feed update status index=ib_syslog host="$grid_member_var$" named rpz zone NOT DBRef | rex "zone '?(?<rpz_zone>[^ '/]*)" | eval mytime=strftime(_time, "%d-%m-%Y %H:%M:%S") | stats latest(mytime) as time, latest(message) by rpz_zone | sort -time | rename rpz_zone AS "RPZ Zone", latest(message) as "Last message" $time.earliest$ $time.latest$
DNS problem indicators Request time-outs index=ib_dns_summary report=si_top_timeout_queries orig_host="$grid_member_var$"| timechart sum(COUNT) as Timeouts $time.earliest$ $time.latest$ Top 10 timed-out domains index=ib_dns_summary report=si_top_timeout_queries orig_host="$grid_member_var$"| stats sum(COUNT) as SFT_QUERIES by NAME | sort -SFT_QUERIES | eventstats sum(SFT_QUERIES) as COUNT_SUM | eval TIMEOUT_PERCENT=round(SFT_QUERIES*100/COUNT_SUM, 6) | rename NAME as "Domain Name", SFT_QUERIES as "count", TIMEOUT_PERCENT as "percent" | fields "Domain Name", "count", "percent"| head 10 $time.earliest$ $time.latest$
Requests resolved after disabling EDNS index=ib_syslog "success resolving" "after disabling EDNS" host="$grid_member_var$" | timechart count $time.earliest$ $time.latest$ Top 10 domains resolved after disabling EDNS index=ib_syslog "success resolving" "after disabling EDNS" host="$grid_member_var$" | top 10 fqdn $time.earliest$ $time.latest$
Requests resolved after reducing EDNS to 512 index=ib_syslog "success resolving" "after reducing the advertised EDNS UDP packet size to 512 octets" host="$grid_member_var$" | timechart count $time.earliest$ $time.latest$ Top 10 domains resolved after reducing EDNS to 512 index=ib_syslog "success resolving" "after reducing the advertised EDNS UDP packet size to 512 octets" host="$grid_member_var$" | top 10 fqdn $time.earliest$ $time.latest$
LAME delegations index=ib_syslog "lame server resolving" host="$grid_member_var$"| timechart count $time.earliest$ $time.latest$ Top 10 LAME delegation domains index=ib_syslog "lame server resolving" host="$grid_member_var$" | top 10 fqdn $time.earliest$ $time.latest$
Unexpected REFUSED return code index=ib_syslog "REFUSED unexpected RCODE resolving" host="$grid_member_var$"| timechart count $time.earliest$ $time.latest$ Top 10 Unexpected REFUSED return code domains index=ib_syslog "REFUSED unexpected RCODE resolving" host="$grid_member_var$" | top 10 fqdn $time.earliest$ $time.latest$
Unexpected SERVFAIL return code index=ib_syslog "SERVFAIL unexpected RCODE resolving" host="$grid_member_var$"| timechart count $time.earliest$ $time.latest$ Top 10 Unexpected SERVFAIL return code domains index=ib_syslog "SERVFAIL unexpected RCODE resolving" host="$grid_member_var$" | top 10 fqdn $time.earliest$ $time.latest$
Unexpected FORMERR return code index=ib_syslog "FORMERR resolving" host="$grid_member_var$"| timechart count $time.earliest$ $time.latest$ Top 10 Unexpected FORMERR return code domains index=ib_syslog "FORMERR resolving" host="$grid_member_var$" | top 10 fqdn $time.earliest$ $time.latest$
Security related indicators Fetches per server events index=ib_syslog "adb: quota " host="$grid_member_var$"| timechart count $time.earliest$ $time.latest$ Top 10 fetches per server IPs index=ib_syslog "adb: quota " host="$grid_member_var$" | top 10 fetches_server_ip $time.earliest$ $time.latest$
Fetches per zone events index=ib_syslog "too many simultaneous fetches for " host="$grid_member_var$"| timechart count $time.earliest$ $time.latest$ Top 10 fetches per zone FQDNs index=ib_syslog "too many simultaneous fetches for " host="$grid_member_var$" | top 10 fetches_zone_name $time.earliest$ $time.latest$