  /* Update ack counter until next congestion signal event is allowed */
  if (cc->next_cc_event)
    cc->next_cc_event--;

  /* Update ack counter until a full cwnd is processed */
  if (cc->next_cwnd_event)
    cc->next_cwnd_event--;
  /* Compute BDP and RTT. If we did not update, don't run the alg */
  if (!congestion_control_update_circuit_estimates(cc, circ)) {
    cc->inflight = cc->inflight - cc->sendme_inc;
    return 0;
  }
  /* The queue use is the amount in which our cwnd is above BDP;
   * if it is below, then 0 queue use. */
  if (vegas_bdp(cc) > cc->cwnd)
    queue_use = 0; // This should not happen anymore..
  else
    queue_use = cc->cwnd - vegas_bdp(cc);
  /* Update the full state */
  if (cwnd_became_full(cc))
    cc->cwnd_full = 1;
  else if (cwnd_became_nonfull(cc))
    cc->cwnd_full = 0;
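cwnd_became_full() and cwnd_became_nonfull() are not shown in this excerpt. A minimal sketch of the kind of check they perform, assuming "full" means inflight has come within a small gap of the cwnd and "nonfull" means inflight has dropped below some fraction of it (both threshold parameters here are illustrative, not taken from the source):

/* Sketch only: illustrative fullness checks, not the verbatim tor helpers. */
static inline bool
cwnd_became_full_sketch(const congestion_control_t *cc, uint64_t full_gap)
{
  /* "Full" once inflight is within full_gap SENDMEs of the cwnd */
  return cc->inflight + full_gap*cc->sendme_inc >= cc->cwnd;
}

static inline bool
cwnd_became_nonfull_sketch(const congestion_control_t *cc, uint64_t minpct)
{
  /* "Non-full" once inflight drops below minpct percent of the cwnd */
  return 100*cc->inflight < minpct*cc->cwnd;
}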
/**
 * Upon receipt of a SENDME, pop the oldest timestamp off the timestamp
 * list, and use this to update RTT.
 *
 * Returns true if circuit estimates were successfully updated, false
 * otherwise.
 */
bool
congestion_control_update_circuit_estimates(congestion_control_t *cc,
                                            const circuit_t *circ)
{
  uint64_t now_usec = monotime_absolute_usec();
/**
 * Called when we get a SENDME. Updates circuit RTT by pulling off a
 * timestamp of when we sent the CIRCWINDOW_INCREMENT-th cell from
 * the queue of such timestamps, and comparing that to current time.
 *
 * Also updates min, max, and EWMA of RTT.
 *
 * Returns the current circuit RTT in usecs, or 0 if it could not be
 * measured (due to clock jump, stall, etc).
 */
STATIC uint64_t
congestion_control_update_circuit_rtt(congestion_control_t *cc,
                                      uint64_t now_usec)
{
  uint64_t rtt, ewma_cnt;
  uint64_t sent_at_timestamp;

  tor_assert(cc);

  /* Get the time that we sent the cell that resulted in the other
   * end sending this sendme. Use this to calculate RTT */
  sent_at_timestamp = dequeue_timestamp(cc->sendme_pending_timestamps);

  rtt = now_usec - sent_at_timestamp;

  /* Do not update RTT at all if it looks fishy */
  if (time_delta_stalled_or_jumped(cc, cc->ewma_rtt_usec, rtt)) {
    num_clock_stalls++; /* Accounting */
    return 0;
  }
  if (rtt > cc->max_rtt_usec) {
    cc->max_rtt_usec = rtt;
  }

  if (cc->min_rtt_usec == 0) {
    // If we do not have a min_rtt yet, use current ewma
    cc->min_rtt_usec = cc->ewma_rtt_usec;
  } else if (cc->cwnd == cc->cwnd_min && !cc->in_slow_start) {
    // Raise min rtt if cwnd hit cwnd_min. This gets us out of a wedge state
    // if we hit cwnd_min due to an abnormally low rtt.
    uint64_t new_rtt = percent_max_mix(cc->ewma_rtt_usec, cc->min_rtt_usec,
                                       rtt_reset_pct);

    static ratelim_t rtt_notice_limit = RATELIM_INIT(300);
    log_fn_ratelim(&rtt_notice_limit, LOG_NOTICE, LD_CIRC,
                   "Resetting circ RTT from %"PRIu64" to %"PRIu64
                   " due to low cwnd",
                   cc->min_rtt_usec/1000, new_rtt/1000);

    cc->min_rtt_usec = new_rtt;
    num_rtt_reset++; /* Accounting */
  } else if (cc->ewma_rtt_usec < cc->min_rtt_usec) {
    // Using the EWMA for min instead of current RTT helps average out
    // effects from other conns
    cc->min_rtt_usec = cc->ewma_rtt_usec;
  }

  return rtt;
}
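percent_max_mix() is not shown in this excerpt. Judging from its use above, it blends the larger and smaller of its two arguments according to a percentage, so the reset pulls min_rtt_usec toward the (larger) EWMA RTT. A rough sketch under that assumption, not the verbatim helper:

/* Sketch only: mix the max and min of (a, b), giving pct_max percent
 * weight to the larger value. */
static inline uint64_t
percent_max_mix_sketch(uint64_t a, uint64_t b, uint8_t pct_max)
{
  uint64_t max = a > b ? a : b;
  uint64_t min = a > b ? b : a;

  if (pct_max > 100)
    return max;

  return pct_max*max/100 + (100 - pct_max)*min/100;
}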
/**
 * Called when we get a SENDME. Updates the bandwidth-delay-product (BDP)
 * estimates of a circuit. Several methods of computing BDP are used,
 * depending on scenario. While some congestion control algorithms only
 * use one of these methods, we update them all because it's quick and easy.
 *
 *   - now_usec is the current monotime in usecs.
 *   - curr_rtt_usec is the current circuit RTT in usecs. It may be 0 if no
 *     RTT could be measured.
 *
 * Returns true if we were able to update BDP, false otherwise.
 */
static bool
congestion_control_update_circuit_bdp(congestion_control_t *cc,
                                      const circuit_t *circ,
                                      uint64_t curr_rtt_usec)
{
  int chan_q = 0;
  unsigned int blocked_on_chan = 0;

  tor_assert(cc);

  if (CIRCUIT_IS_ORIGIN(circ)) {
    /* origin circs use n_chan */
    chan_q = circ->n_chan_cells.n;
    blocked_on_chan = circ->circuit_blocked_on_n_chan;
  } else {
    /* Both onion services and exits use or_circuit and p_chan */
    chan_q = CONST_TO_OR_CIRCUIT(circ)->p_chan_cells.n;
    blocked_on_chan = circ->circuit_blocked_on_p_chan;
  }

  /* If we have no EWMA RTT, it is because monotime has been stalled
   * or messed up the entire time so far. Set our BDP estimates directly
   * to current cwnd */
  if (!cc->ewma_rtt_usec) {
    uint64_t cwnd = cc->cwnd;
    tor_assert_nonfatal(cc->cwnd <= cwnd_max);

    /* If the channel is blocked, keep subtracting off the chan_q
     * until we hit the min cwnd. */
    if (blocked_on_chan) {
      /* Cast is fine because we're less than int32 */
      if (chan_q >= (int64_t)cwnd) {
        log_notice(LD_CIRC, "Clock stall with large chanq: %d %"PRIu64,
                   chan_q, cwnd);
        cwnd = cc->cwnd_min;
      } else {
        cwnd = MAX(cwnd - chan_q, cc->cwnd_min);
      }
      cc->blocked_chan = 1;
    } else {
      cc->blocked_chan = 0;
    }

    cc->bdp = cwnd;

    static ratelim_t dec_notice_limit = RATELIM_INIT(300);
    log_fn_ratelim(&dec_notice_limit, LOG_NOTICE, LD_CIRC,
                   "Our clock has been stalled for the entire lifetime of a "
                   "circuit. Performance may be sub-optimal.");

    return blocked_on_chan;
  }
  /* Congestion window based BDP will respond to changes in RTT only, and is
   * relative to cwnd growth. It is useful for correcting for BDP
   * overestimation, but if BDP is higher than the current cwnd, it will
   * underestimate it.
   *
   * We multiply here first to avoid precision issues from min_RTT being
   * close to ewma RTT. Since all fields are u64, there is plenty of
   * room here to multiply first. */
  cc->bdp = cc->cwnd*cc->min_rtt_usec/cc->ewma_rtt_usec;
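  /* Illustrative numbers, not from the source: with cwnd = 600 cells,
   * min_rtt = 200 ms and ewma_rtt = 300 ms this gives
   * bdp = 600*200000/300000 = 400 cells, i.e. the estimate shrinks as the
   * smoothed RTT rises above the minimum, which indicates queueing delay. */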
  /* The orconn is blocked; use smaller of inflight vs SENDME */
  if (blocked_on_chan) {
    log_info(LD_CIRC, "CC: Streams blocked on circ channel. Chanq: %d",
             chan_q);

    /* A blocked channel is an immediate congestion signal, but it still
     * happens only once per cwnd */
    if (!cc->blocked_chan) {
      cc->next_cc_event = 0;
      cc->blocked_chan = 1;
    }
  } else {
    /* If we were previously blocked, emit a new congestion event
     * now that we are unblocked, to re-evaluate cwnd */
    if (cc->blocked_chan) {
      cc->blocked_chan = 0;
      cc->next_cc_event = 0;
      log_info(LD_CIRC, "CC: Streams un-blocked on circ channel. Chanq: %d",
               chan_q);
    }
  }

  /* We updated BDP this round if either we had a blocked channel, or
   * the curr_rtt_usec was not 0. */
  bool ret = (blocked_on_chan || curr_rtt_usec != 0);
  if (ret) {
    tor_trace(TR_SUBSYS(cc), TR_EV(bdp_update), circ, cc, curr_rtt_usec);
  }
  return ret;
}
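The body of congestion_control_update_circuit_estimates() was cut short at the top of this section; it essentially chains the two helpers above, feeding the fresh RTT into the BDP update and returning the latter's result. A sketch assuming that wiring:

/* Sketch of how the two helpers above are chained by their caller. */
bool
congestion_control_update_circuit_estimates_sketch(congestion_control_t *cc,
                                                   const circuit_t *circ)
{
  uint64_t now_usec = monotime_absolute_usec();
  uint64_t curr_rtt_usec =
    congestion_control_update_circuit_rtt(cc, now_usec);

  /* BDP needs the fresh RTT; returns false if neither signal was usable */
  return congestion_control_update_circuit_bdp(cc, circ, curr_rtt_usec);
}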
  if (cc->in_slow_start) {
    if (queue_use < cc->vegas_params.gamma && !cc->blocked_chan) {
      /* If the congestion window is not fully in use, skip any
       * increment of cwnd in slow start */
      if (cc->cwnd_full) {
        /* Get the "Limited Slow Start" increment */
        uint64_t inc = rfc3742_ss_inc(cc);
        cc->cwnd += inc;

        // Check if inc is less than what we would do in steady-state
        // avoidance. Note that this is likely never to happen
        // in practice, but we keep this block and the metrics to make
        // sure.
        if (inc*SENDME_PER_CWND(cc) <= CWND_INC(cc)*cc->cwnd_inc_rate) {
          congestion_control_vegas_exit_slow_start(circ, cc);

          cc_stats_vegas_below_ss_inc_floor++;

          /* We exited slow start without being blocked */
          cc_stats_vegas_ss_csig_blocked_ma =
            stats_update_running_avg(cc_stats_vegas_ss_csig_blocked_ma, 0);
        }
      }
    } else {
      uint64_t old_cwnd = cc->cwnd;
      /* Congestion signal: Set cwnd to gamma threshold */
      cc->cwnd = vegas_bdp(cc) + cc->vegas_params.gamma;

      /* Compute the percentage we experience a blocked csig vs RTT sig */
      if (cc->blocked_chan) {
        cc_stats_vegas_ss_csig_blocked_ma =
          stats_update_running_avg(cc_stats_vegas_ss_csig_blocked_ma, 100);
      } else {
        uint64_t cwnd_diff = (old_cwnd > cc->cwnd ? old_cwnd - cc->cwnd : 0);

        /* Account the amount we reduced the cwnd by for the gamma cutoff */
        cc_stats_vegas_gamma_drop_ma =
          stats_update_running_avg(cc_stats_vegas_gamma_drop_ma, cwnd_diff);
      }
    /* If we are above the delta threshold, drop cwnd down to the
     * delta threshold. */
    cc->cwnd = vegas_bdp(cc) + cc->vegas_params.delta - CWND_INC(cc);

    /* Account the amount we reduced the cwnd by for the gamma cutoff */
    cwnd_diff = (old_cwnd > cc->cwnd ? old_cwnd - cc->cwnd : 0);
    cc_stats_vegas_delta_drop_ma =
      stats_update_running_avg(cc_stats_vegas_delta_drop_ma, cwnd_diff);

    /* Compute the percentage we experience a blocked csig vs RTT sig */
    if (cc->blocked_chan) {
      cc_stats_vegas_csig_blocked_ma =
        stats_update_running_avg(cc_stats_vegas_csig_blocked_ma, 100);
    } else {
      cc_stats_vegas_csig_blocked_ma =
        stats_update_running_avg(cc_stats_vegas_csig_blocked_ma, 0);
    }

    /* Log if we're above the ss_cap */
    if (cc->cwnd >= cc->vegas_params.ss_cwnd_max) {
      log_info(LD_CIRC,
               "CC: TOR_VEGAS above ss_max in steady state for circ %d: "
               "%"PRIu64, circ->purpose, cc->cwnd);
    }
  }
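To make the two cwnd clamps above concrete, here is a worked example with purely illustrative parameter values (these are not the consensus defaults): with vegas_bdp(cc) = 400 cells, gamma = 100, delta = 200 and CWND_INC(cc) = 31, a congestion signal during slow start clamps cwnd to 400 + 100 = 500 cells, while crossing the delta threshold in steady state clamps it to 400 + 200 - 31 = 569 cells.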
/**
 * Returns the number of sendme acks we will receive before we update cwnd.
 *
 * Congestion control literature recommends only one update of cwnd per
 * cwnd worth of acks. However, we can also tune this to be more frequent
 * by increasing the 'cc_cwnd_inc_rate' consensus parameter. This tuning
 * only applies after slow start.
 *
 * If this returns 0 due to high cwnd_inc_rate, the calling code will
 * update every sendme ack.
 */
static inline uint64_t
CWND_UPDATE_RATE(const struct congestion_control_t *cc)
{
  /* We add cwnd_inc_rate*sendme_inc/2 to round to nearest integer number
   * of acks */

  if (cc->in_slow_start) {
    return 1;
  } else {
    return ((cc->cwnd + cc->cwnd_inc_rate*cc->sendme_inc/2)
           / (cc->cwnd_inc_rate*cc->sendme_inc));
  }
}
/**
 * Gives us the number of SENDMEs in a CWND, rounded.
 */
static inline uint64_t
SENDME_PER_CWND(const struct congestion_control_t *cc)
{
  /* We add cwnd_inc_rate*sendme_inc/2 to round to nearest integer number
   * of acks */
  return ((cc->cwnd + cc->sendme_inc/2)/cc->sendme_inc);
}
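A quick worked example with illustrative values (sendme_inc = 31 cells and cwnd_inc_rate = 1 are assumptions for the arithmetic, not quoted defaults): with cwnd = 496, SENDME_PER_CWND gives (496 + 15)/31 = 16, and CWND_UPDATE_RATE likewise gives (496 + 15)/31 = 16, i.e. one cwnd update per cwnd worth of acks. Raising cwnd_inc_rate shrinks that interval, so updates happen more often; once the division bottoms out at 0, the caller updates on every SENDME.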
/**
 * Process a SENDME and update the congestion window according to the
 * rules specified in TOR_VEGAS of Proposal #324.
 *
 * Essentially, this algorithm attempts to measure queue lengths on
 * the circuit by subtracting the bandwidth-delay-product estimate
 * from the current congestion window.
 *
 * If the congestion window is larger than the bandwidth-delay-product,
 * then data is assumed to be queuing. We reduce the congestion window
 * in that case.
 *
 * If the congestion window is smaller than the bandwidth-delay-product,
 * then there is spare bandwidth capacity on the circuit. We increase the
 * congestion window in that case.
 *
 * The congestion window is updated only once every congestion window worth
 * of packets, even if the signal persists. It is also updated whenever the
 * upstream orconn blocks, or unblocks. This minimizes local client queues.
 */
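The steady-state branch that this comment describes is only partially quoted above. A condensed sketch of the decision rule, written from the comment and the vegas_params_t thresholds shown further down (a simplification, not the verbatim function body):

/* Condensed sketch of the TOR_VEGAS steady-state update, once per cwnd. */
static void
vegas_steady_state_sketch(congestion_control_t *cc, uint64_t queue_use)
{
  if (queue_use > cc->vegas_params.delta) {
    /* Severe queueing: clamp cwnd to BDP plus the delta cap */
    cc->cwnd = vegas_bdp(cc) + cc->vegas_params.delta - CWND_INC(cc);
  } else if (queue_use > cc->vegas_params.beta || cc->blocked_chan) {
    /* Queueing above beta (or a blocked channel): back the window off */
    cc->cwnd -= CWND_INC(cc);
  } else if (cc->cwnd_full && queue_use < cc->vegas_params.alpha) {
    /* Spare capacity and the window is actually being used: grow it */
    cc->cwnd += CWND_INC(cc);
  }
  /* Between alpha and beta: leave cwnd alone. */
}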
4. Computing the queue length
  if (vegas_bdp(cc) > cc->cwnd)
    queue_use = 0; // This should not happen anymore..
  else
    queue_use = cc->cwnd - vegas_bdp(cc);
struct vegas_params_t {
  /** The slow-start cwnd cap for RFC3742 */
  uint32_t ss_cwnd_cap;
  /** The maximum slow-start cwnd */
  uint32_t ss_cwnd_max;
  /** The queue use allowed before we exit slow start */
  uint16_t gamma;
  /** The queue use below which we increment cwnd */
  uint16_t alpha;
  /** The queue use above which we decrement cwnd */
  uint16_t beta;
  /** The queue use at which we cap cwnd in steady state */
  uint16_t delta;
  /** Weighted average (percent) between cwnd estimator and
   * piecewise estimator. */
  uint8_t bdp_mix_pct;
};
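bdp_mix_pct is only described by its field comment here. Going by that description, a blended BDP estimate could be computed roughly as follows (a sketch under that assumption; the helper name and the separate estimator arguments are hypothetical):

/* Sketch only: weight the cwnd-based and piecewise BDP estimators
 * by bdp_mix_pct percent. */
static inline uint64_t
vegas_bdp_mix_sketch(const congestion_control_t *cc,
                     uint64_t cwnd_bdp, uint64_t piecewise_bdp)
{
  return cc->vegas_params.bdp_mix_pct*cwnd_bdp/100 +
         (100 - cc->vegas_params.bdp_mix_pct)*piecewise_bdp/100;
}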
  if (!new_circ && relay_command != RELAY_COMMAND_DATA) {
    if (!cfx->curr_leg) {
      log_warn(LD_BUG, "No current leg for conflux with relay command %d",
               relay_command);
      return NULL;
    }
    return cfx->curr_leg->circ;
  }