quickjs-tart

quickjs-based runtime for wallet-core logic

ares_metrics.c (9442B)


/* MIT License
 *
 * Copyright (c) 2024 Brad House
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * SPDX-License-Identifier: MIT
 */


/* IMPLEMENTATION NOTES
 * ====================
 *
 * With very little effort we can derive reasonably accurate timeouts from
 * prior query history.  We track that history so timeouts auto-scale when
 * network conditions change (e.g. a provider failover alters round-trip
 * times).  Apple appears to do this within its system resolver on macOS.
 * We also enforce a minimum, maximum, and initial value to make sure the
 * algorithm can't go off the rails.
 *
 * Values (a worked example follows this list):
 * - Minimum Timeout: 250ms (approximate RTT half-way around the globe)
 * - Maximum Timeout: 5000ms (recommended timeout in RFC 1123); if
 *   ARES_OPT_MAXTIMEOUTMS is set, the bound specified by that option caps the
 *   retry timeout instead.
 * - Initial Timeout: user-specified via configuration or ARES_OPT_TIMEOUTMS
 * - Average Latency Multiplier: 5x (a local DNS server returning a cached
 *   value will be quicker than one that needs to recurse, so we need to
 *   account for this)
 * - Minimum Count for Average: 3.  This is the minimum number of queries
 *   needed to form an average for a bucket.
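 *
 * For example (hypothetical numbers): an average latency of 30ms yields a
 * proposed timeout of 30 * 5 = 150ms, which the lower bound raises to 250ms;
 * an average of 1500ms yields 7500ms, which is capped at 5000ms (or at the
 * ARES_OPT_MAXTIMEOUTMS bound if that option is set).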
 *
 * Per-server buckets are used to track latency over time (these are
 * ephemeral, meaning they don't persist once a channel is destroyed).  We
 * record both the current timespan for the bucket and the immediately
 * preceding timespan, so that across roll-overs we can still maintain recent
 * metrics for calculations:
 * - 1 minute
 * - 15 minutes
 * - 1 hr
 * - 1 day
 * - since inception
 *
 * Each bucket would contain (see the sketch below):
 * - timestamp (divided by interval)
 * - minimum latency
 * - maximum latency
 * - total time
 * - count
 * NOTE: average latency is (total time / count); we calculate this
 *       dynamically when needed
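 *
 * As a rough sketch only (the real definition lives in the library's private
 * headers; the field names mirror the ones this file accesses and the types
 * are inferred from how they are used below), each bucket holds something
 * like:
 *
 *   struct metrics_bucket_sketch {
 *     time_t        ts;               // current period: now.sec / interval
 *     unsigned int  latency_min_ms;   // fastest reply seen this period
 *     unsigned int  latency_max_ms;   // slowest reply seen this period
 *     ares_uint64_t total_ms;         // sum of reply latencies this period
 *     ares_uint64_t total_count;      // number of replies this period
 *     time_t        prev_ts;          // previous period's timestamp
 *     ares_uint64_t prev_total_ms;    // previous period's total_ms
 *     ares_uint64_t prev_total_count; // previous period's total_count
 *   };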
 *
 * The basic algorithm for calculating the timeout to use is (worked example
 * below):
 * - Scan from the most recent bucket (1 minute) to the least recent
 *   (inception)
 * - If the bucket's timestamp matches the current time period and its count
 *   is at least the "Minimum Count for Average", use that bucket
 * - Otherwise check the bucket's previous time period; if its timestamp and
 *   count qualify, use it, else continue to the next bucket
 * - If we reach the end with no bucket match, use the "Initial Timeout"
 * - If a bucket is selected, take ("total time" / count) as the average
 *   latency, multiply by the "Average Latency Multiplier", and bound the
 *   result by the "Minimum Timeout" and "Maximum Timeout"
 * NOTE: The timeout calculated may not be the timeout used.  If we are
 *       retrying the query on the same server another time, a larger value
 *       will be used.
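 *
 * Worked example (hypothetical numbers): suppose the 1-minute bucket holds
 * only 2 samples and its previous period is stale, while the 15-minute
 * bucket holds 12 samples totalling 960ms.  The 1-minute bucket is skipped,
 * the 15-minute average is 960 / 12 = 80ms, and the timeout used is
 * 80 * 5 = 400ms, which already sits within the bounds.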
 *
 * On each query reply where the response is legitimate (a proper response or
 * NXDOMAIN) and not something like a server error (concrete example after
 * this list):
 * - Cycle through each bucket in order
 * - Check the bucket's timestamp against the current timestamp; if it is out
 *   of date, copy the current values into the previous entry and clear the
 *   current values
 * - Compare the recorded minimum and maximum latency against the query time
 *   and adjust if necessary
 * - Increment "count" by 1 and "total time" by the query time
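 *
 * For example (concrete only for the 1-minute bucket): its timestamp is
 * now.sec / 60, so the first reply that arrives in a new minute rotates the
 * just-finished minute's totals into the "previous" fields and resets the
 * current ones, keeping the most recent full minute usable for averaging.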
 *
 * Other Notes:
 * - This is always on; the only user-configurable value is the initial
 *   timeout, which simply re-uses the existing timeout option.
 * - The minimum and maximum latencies for a bucket are currently unused but
 *   are there in case we find a need for them in the future.
 */

#include "ares_private.h"

/*! Minimum timeout value.  Chosen because it is approximately the RTT for a
 *  trip half-way around the world */
#define MIN_TIMEOUT_MS 250

/*! Multiplier applied to the average latency to come up with an initial
 *  timeout */
#define AVG_TIMEOUT_MULTIPLIER 5

/*! Upper timeout bound, only used if channel->maxtimeout is not set */
#define MAX_TIMEOUT_MS 5000

/*! Minimum number of queries required to form an average */
#define MIN_COUNT_FOR_AVERAGE 3

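/*! Compute the identifier of the time period a bucket currently covers.  The
 *  identifier is the wall-clock seconds divided by the bucket's interval
 *  length, so it changes exactly once per interval; the inception bucket
 *  always maps to period 1.
 *
 *  \param[in] bucket       Bucket granularity to compute the period for
 *  \param[in] now          Current time
 *  \param[in] is_previous  ARES_TRUE to compute the immediately preceding
 *                          period rather than the current one
 *  \return period identifier, or 0 if the request is invalid or has no
 *          preceding period
 */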
static time_t ares_metric_timestamp(ares_server_bucket_t  bucket,
                                    const ares_timeval_t *now,
                                    ares_bool_t           is_previous)
{
  time_t divisor = 1; /* Silence bogus MSVC warning by setting default value */

  switch (bucket) {
    case ARES_METRIC_1MINUTE:
      divisor = 60;
      break;
    case ARES_METRIC_15MINUTES:
      divisor = 15 * 60;
      break;
    case ARES_METRIC_1HOUR:
      divisor = 60 * 60;
      break;
    case ARES_METRIC_1DAY:
      divisor = 24 * 60 * 60;
      break;
    case ARES_METRIC_INCEPTION:
      return is_previous ? 0 : 1;
    case ARES_METRIC_COUNT:
      return 0; /* Invalid! */
  }

  if (is_previous) {
    if (divisor >= now->sec) {
      return 0;
    }
    return (time_t)((now->sec - divisor) / divisor);
  }

  return (time_t)(now->sec / divisor);
}

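/*! Record the latency of a completed query against every bucket of the
 *  server that answered it.  Only successful queries whose response code is
 *  NOERROR or NXDOMAIN are recorded, so server failures don't skew the
 *  averages.  A bucket whose time period has rolled over is rotated into its
 *  "previous" slot before the new sample is added.
 */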
void ares_metrics_record(const ares_query_t *query, ares_server_t *server,
                         ares_status_t status, const ares_dns_record_t *dnsrec)
{
  ares_timeval_t       now;
  ares_timeval_t       tvdiff;
  unsigned int         query_ms;
  ares_dns_rcode_t     rcode;
  ares_server_bucket_t i;

  if (status != ARES_SUCCESS) {
    return;
  }

  if (server == NULL) {
    return;
  }

  ares_tvnow(&now);

  rcode = ares_dns_record_get_rcode(dnsrec);
  if (rcode != ARES_RCODE_NOERROR && rcode != ARES_RCODE_NXDOMAIN) {
    return;
  }

  ares_timeval_diff(&tvdiff, &query->ts, &now);
  query_ms = (unsigned int)((tvdiff.sec * 1000) + (tvdiff.usec / 1000));
  if (query_ms == 0) {
    query_ms = 1;
  }

  /* Place in each bucket */
  for (i = 0; i < ARES_METRIC_COUNT; i++) {
    time_t ts = ares_metric_timestamp(i, &now, ARES_FALSE);

    /* Copy metrics to prev and clear */
    if (ts != server->metrics[i].ts) {
      server->metrics[i].prev_ts          = server->metrics[i].ts;
      server->metrics[i].prev_total_ms    = server->metrics[i].total_ms;
      server->metrics[i].prev_total_count = server->metrics[i].total_count;
      server->metrics[i].ts               = ts;
      server->metrics[i].latency_min_ms   = 0;
      server->metrics[i].latency_max_ms   = 0;
      server->metrics[i].total_ms         = 0;
      server->metrics[i].total_count      = 0;
    }

    if (server->metrics[i].latency_min_ms == 0 ||
        server->metrics[i].latency_min_ms > query_ms) {
      server->metrics[i].latency_min_ms = query_ms;
    }

    if (query_ms > server->metrics[i].latency_max_ms) {
      server->metrics[i].latency_max_ms = query_ms;
    }

    server->metrics[i].total_count++;
    server->metrics[i].total_ms += (ares_uint64_t)query_ms;
  }
}

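/*! Calculate the query timeout to use for a server based on its recorded
 *  latency history.  Buckets are scanned from the shortest interval to the
 *  longest; the first current (or immediately previous) period with at least
 *  MIN_COUNT_FOR_AVERAGE samples supplies the average latency, which is then
 *  multiplied by AVG_TIMEOUT_MULTIPLIER.  If no bucket qualifies, the
 *  channel's configured timeout is used.  The result is clamped between
 *  MIN_TIMEOUT_MS and channel->maxtimeout (or MAX_TIMEOUT_MS if unset).
 */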
size_t ares_metrics_server_timeout(const ares_server_t  *server,
                                   const ares_timeval_t *now)
{
  const ares_channel_t *channel = server->channel;
  ares_server_bucket_t  i;
  size_t                timeout_ms = 0;
  size_t                max_timeout_ms;

  for (i = 0; i < ARES_METRIC_COUNT; i++) {
    time_t ts = ares_metric_timestamp(i, now, ARES_FALSE);

    /* The current period is stale or has too few samples, see if we should
     * use the previous time period */
    if (ts != server->metrics[i].ts ||
        server->metrics[i].total_count < MIN_COUNT_FOR_AVERAGE) {
      time_t prev_ts = ares_metric_timestamp(i, now, ARES_TRUE);
      if (prev_ts != server->metrics[i].prev_ts ||
          server->metrics[i].prev_total_count < MIN_COUNT_FOR_AVERAGE) {
        /* Move on to the next bucket */
        continue;
      }
      /* Calculate average time for previous bucket */
      timeout_ms = (size_t)(server->metrics[i].prev_total_ms /
                            server->metrics[i].prev_total_count);
    } else {
      /* Calculate average time for current bucket */
      timeout_ms =
        (size_t)(server->metrics[i].total_ms / server->metrics[i].total_count);
    }

    /* Multiply average by constant to get timeout value */
    timeout_ms *= AVG_TIMEOUT_MULTIPLIER;
    break;
  }

  /* If we get here with no timeout, no bucket had enough data (e.g. these are
   * the first queries to the server), so just use the initial default
   * timeout */
  if (timeout_ms == 0) {
    timeout_ms = channel->timeout;
  }

  /* don't go below lower bounds */
  if (timeout_ms < MIN_TIMEOUT_MS) {
    timeout_ms = MIN_TIMEOUT_MS;
  }

  /* don't go above upper bounds */
  max_timeout_ms = channel->maxtimeout ? channel->maxtimeout : MAX_TIMEOUT_MS;
  if (timeout_ms > max_timeout_ms) {
    timeout_ms = max_timeout_ms;
  }

  return timeout_ms;
}