Lines Matching +full:low +full:- +full:cost
1 /* SPDX-License-Identifier: GPL-2.0
3 * IO cost model based controller.
10 * observable cost metric. This is distinguished from CPU and memory where
17 * useless for the purpose of IO capacity distribution. While on-device
19 * non-queued rotational devices, this is no longer viable with modern
22 * While there is no cost metric we can trivially observe, it isn't a
23 * complete mystery. For example, on a rotational device, seek cost
27 * implement a reasonable work-conserving proportional IO resource
30 * 1. IO Cost Model
32 * IO cost model estimates the cost of an IO given its basic parameters and
33 * history (e.g. the end sector of the last IO). The cost is measured in
34 * device time. If a given IO is estimated to cost 10ms, the device should
37 * Currently, there's only one builtin cost model - linear. Each IO is
38 * classified as sequential or random and given a base cost accordingly.
39 * On top of that, a size cost proportional to the length of the IO is
44 * /sys/fs/cgroup/io.cost.model.
47 * device-specific coefficients.
54 * 2-1. Vtime Distribution
75 * against the device vtime - an IO which takes 10ms on the underlying
84 * 2-2. Vrate Adjustment
86 * It's unrealistic to expect the cost model to be perfect. There are too
97 * To slow down, we lower the vrate - the rate at which the device vtime
100 * 750ms worth of IOs per second, and vice-versa for speeding up.
102 * Device business is determined using two criteria - rq wait and
105 * When a device gets saturated, the on-device and then the request queues
121 * service. There is an inherent trade-off - the tighter the latency QoS,
123 * and can be set through /sys/fs/cgroup/io.cost.qos.
125 * 2-3. Work Conservation
130 * cost per second, i.e., 10% of the device capacity. The naive
133 * compared to free-for-all competition. This is too high a cost to pay
156 * controller uses a drgn based monitoring script -
165 * - per : Timer period
166 * - cur_per : Internal wall and device vtime clock
167 * - vrate : Device virtual time rate against wall clock
168 * - weight : Surplus-adjusted and configured weights
169 * - hweight : Surplus-adjusted and configured hierarchical weights
170 * - inflt : The percentage of in-flight IO cost at the end of last period
171 * - del_ms : Deferred issuer delay induction level and duration
172 * - usages : Usage history
183 #include "blk-rq-qos.h"
184 #include "blk-stat.h"
185 #include "blk-wbt.h"
186 #include "blk-cgroup.h"
190 /* copied from TRACE_CGROUP_PATH, see cgroup-internal.h */
200 cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup, \
220 * iocg->vtime is targeted at 50% behind the device vtime, which
239 * As vtime is used to calculate the cost of each IO, it needs to
241 * represent the cost of a single page worth of discard with
246 * 1s worth of vtime is 2^37. This gives us both sub-nanosecond
247 * granularity and days of wrap-around time even at extreme vrates.
273 * The effect of delay is indirect and non-linear and a huge amount of
288 * cache, the kernel doesn't have well-defined back-pressure propagation
306 * size-proportional components of cost calculation in closer
307 * numbers of digits to per-IO cost components.
311 IOC_SECT_TO_PAGE_SHIFT = IOC_PAGE_SHIFT - SECTOR_SHIFT,
323 /* io.cost.qos controls including per-dev enable of the whole controller */
330 /* io.cost.qos params */
341 /* io.cost.model controls */
348 /* builtin linear cost model coefficients */
387 s64 low; member
461 /* per device-cgroup pair */
467 * An iocg can get its weight from two sources - an explicit
468 * per-device-cgroup configuration or the default weight of the
469 * cgroup. `cfg_weight` is the explicit per-device-cgroup
501 * than issue. The delta behind `vtime` represents the cost of
502 * currently in-flight IOs.
648 * vrate adjust percentages indexed by ioc->busy_level. We adjust up on
672 struct gendisk *disk = ioc->rqos.disk; in ioc_name()
676 return disk->disk_name; in ioc_name()
691 return pd_to_blkg(&iocg->pd); in iocg_to_blkg()
712 static u64 cost_to_abs_cost(u64 cost, u32 hw_inuse) in cost_to_abs_cost() argument
714 return DIV64_U64_ROUND_UP(cost * hw_inuse, WEIGHT_ONE); in cost_to_abs_cost()
718 u64 abs_cost, u64 cost) in iocg_commit_bio() argument
722 bio->bi_iocost_cost = cost; in iocg_commit_bio()
723 atomic64_add(cost, &iocg->vtime); in iocg_commit_bio()
725 gcs = get_cpu_ptr(iocg->pcpu_stat); in iocg_commit_bio()
726 local64_add(abs_cost, &gcs->abs_vusage); in iocg_commit_bio()
733 spin_lock_irqsave(&iocg->ioc->lock, *flags); in iocg_lock()
734 spin_lock(&iocg->waitq.lock); in iocg_lock()
736 spin_lock_irqsave(&iocg->waitq.lock, *flags); in iocg_lock()
743 spin_unlock(&iocg->waitq.lock); in iocg_unlock()
744 spin_unlock_irqrestore(&iocg->ioc->lock, *flags); in iocg_unlock()
746 spin_unlock_irqrestore(&iocg->waitq.lock, *flags); in iocg_unlock()
755 struct ioc_margins *margins = &ioc->margins; in ioc_refresh_margins()
756 u32 period_us = ioc->period_us; in ioc_refresh_margins()
757 u64 vrate = ioc->vtime_base_rate; in ioc_refresh_margins()
759 margins->min = (period_us * MARGIN_MIN_PCT / 100) * vrate; in ioc_refresh_margins()
760 margins->low = (period_us * MARGIN_LOW_PCT / 100) * vrate; in ioc_refresh_margins()
761 margins->target = (period_us * MARGIN_TARGET_PCT / 100) * vrate; in ioc_refresh_margins()
769 lockdep_assert_held(&ioc->lock); in ioc_refresh_period_us()
772 if (ioc->params.qos[QOS_RLAT] >= ioc->params.qos[QOS_WLAT]) { in ioc_refresh_period_us()
773 ppm = ioc->params.qos[QOS_RPPM]; in ioc_refresh_period_us()
774 lat = ioc->params.qos[QOS_RLAT]; in ioc_refresh_period_us()
776 ppm = ioc->params.qos[QOS_WPPM]; in ioc_refresh_period_us()
777 lat = ioc->params.qos[QOS_WLAT]; in ioc_refresh_period_us()
789 multi = max_t(u32, (MILLION - ppm) / 50000, 2); in ioc_refresh_period_us()
796 ioc->period_us = period_us; in ioc_refresh_period_us()
797 ioc->timer_slack_ns = div64_u64( in ioc_refresh_period_us()
804 * ioc->rqos.disk isn't initialized when this function is called from
809 int idx = ioc->autop_idx; in ioc_autop_idx()
815 if (!blk_queue_nonrot(disk->queue)) in ioc_autop_idx()
819 if (blk_queue_depth(disk->queue) == 1) in ioc_autop_idx()
827 if (ioc->user_qos_params || ioc->user_cost_model) in ioc_autop_idx()
831 vrate_pct = div64_u64(ioc->vtime_base_rate * 100, VTIME_PER_USEC); in ioc_autop_idx()
834 if (p->too_fast_vrate_pct && p->too_fast_vrate_pct <= vrate_pct) { in ioc_autop_idx()
835 if (!ioc->autop_too_fast_at) in ioc_autop_idx()
836 ioc->autop_too_fast_at = now_ns; in ioc_autop_idx()
837 if (now_ns - ioc->autop_too_fast_at >= AUTOP_CYCLE_NSEC) in ioc_autop_idx()
840 ioc->autop_too_fast_at = 0; in ioc_autop_idx()
843 if (p->too_slow_vrate_pct && p->too_slow_vrate_pct >= vrate_pct) { in ioc_autop_idx()
844 if (!ioc->autop_too_slow_at) in ioc_autop_idx()
845 ioc->autop_too_slow_at = now_ns; in ioc_autop_idx()
846 if (now_ns - ioc->autop_too_slow_at >= AUTOP_CYCLE_NSEC) in ioc_autop_idx()
847 return idx - 1; in ioc_autop_idx()
849 ioc->autop_too_slow_at = 0; in ioc_autop_idx()
862 * and calculate the linear model cost coefficients.
864 * *@page per-page cost 1s / (@bps / 4096)
865 * *@seqio base cost of a seq IO max((1s / @seqiops) - *@page, 0)
866 * *@randio base cost of a rand IO max((1s / @randiops) - *@page, 0)
887 *seqio = v - *page; in calc_lcoefs()
893 *randio = v - *page; in calc_lcoefs()
899 u64 *u = ioc->params.i_lcoefs; in ioc_refresh_lcoefs()
900 u64 *c = ioc->params.lcoefs; in ioc_refresh_lcoefs()
909 * struct gendisk is required as an argument because ioc->rqos.disk
918 lockdep_assert_held(&ioc->lock); in ioc_refresh_params_disk()
923 if (idx == ioc->autop_idx && !force) in ioc_refresh_params_disk()
926 if (idx != ioc->autop_idx) { in ioc_refresh_params_disk()
927 atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC); in ioc_refresh_params_disk()
928 ioc->vtime_base_rate = VTIME_PER_USEC; in ioc_refresh_params_disk()
931 ioc->autop_idx = idx; in ioc_refresh_params_disk()
932 ioc->autop_too_fast_at = 0; in ioc_refresh_params_disk()
933 ioc->autop_too_slow_at = 0; in ioc_refresh_params_disk()
935 if (!ioc->user_qos_params) in ioc_refresh_params_disk()
936 memcpy(ioc->params.qos, p->qos, sizeof(p->qos)); in ioc_refresh_params_disk()
937 if (!ioc->user_cost_model) in ioc_refresh_params_disk()
938 memcpy(ioc->params.i_lcoefs, p->i_lcoefs, sizeof(p->i_lcoefs)); in ioc_refresh_params_disk()
943 ioc->vrate_min = DIV64_U64_ROUND_UP((u64)ioc->params.qos[QOS_MIN] * in ioc_refresh_params_disk()
945 ioc->vrate_max = DIV64_U64_ROUND_UP((u64)ioc->params.qos[QOS_MAX] * in ioc_refresh_params_disk()
953 return ioc_refresh_params_disk(ioc, force, ioc->rqos.disk); in ioc_refresh_params()
965 s64 pleft = ioc->period_at + ioc->period_us - now->now; in ioc_refresh_vrate()
966 s64 vperiod = ioc->period_us * ioc->vtime_base_rate; in ioc_refresh_vrate()
969 lockdep_assert_held(&ioc->lock); in ioc_refresh_vrate()
980 vcomp = -div64_s64(ioc->vtime_err, pleft); in ioc_refresh_vrate()
981 vcomp_min = -(ioc->vtime_base_rate >> 1); in ioc_refresh_vrate()
982 vcomp_max = ioc->vtime_base_rate; in ioc_refresh_vrate()
985 ioc->vtime_err += vcomp * pleft; in ioc_refresh_vrate()
987 atomic64_set(&ioc->vtime_rate, ioc->vtime_base_rate + vcomp); in ioc_refresh_vrate()
990 ioc->vtime_err = clamp(ioc->vtime_err, -vperiod, vperiod); in ioc_refresh_vrate()
997 u64 vrate = ioc->vtime_base_rate; in ioc_adjust_base_vrate()
998 u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max; in ioc_adjust_base_vrate()
1000 if (!ioc->busy_level || (ioc->busy_level < 0 && nr_lagging)) { in ioc_adjust_base_vrate()
1001 if (ioc->busy_level != prev_busy_level || nr_lagging) in ioc_adjust_base_vrate()
1018 vrate = div64_u64(vrate * (100 - VRATE_CLAMP_ADJ_PCT), 100); in ioc_adjust_base_vrate()
1021 int idx = min_t(int, abs(ioc->busy_level), in ioc_adjust_base_vrate()
1022 ARRAY_SIZE(vrate_adj_pct) - 1); in ioc_adjust_base_vrate()
1025 if (ioc->busy_level > 0) in ioc_adjust_base_vrate()
1026 adj_pct = 100 - adj_pct; in ioc_adjust_base_vrate()
1037 ioc->vtime_base_rate = vrate; in ioc_adjust_base_vrate()
1047 now->now_ns = blk_time_get_ns(); in ioc_now()
1048 now->now = ktime_to_us(now->now_ns); in ioc_now()
1049 vrate = atomic64_read(&ioc->vtime_rate); in ioc_now()
1060 seq = read_seqcount_begin(&ioc->period_seqcount); in ioc_now()
1061 now->vnow = ioc->period_at_vtime + in ioc_now()
1062 (now->now - ioc->period_at) * vrate; in ioc_now()
1063 } while (read_seqcount_retry(&ioc->period_seqcount, seq)); in ioc_now()
1068 WARN_ON_ONCE(ioc->running != IOC_RUNNING); in ioc_start_period()
1070 write_seqcount_begin(&ioc->period_seqcount); in ioc_start_period()
1071 ioc->period_at = now->now; in ioc_start_period()
1072 ioc->period_at_vtime = now->vnow; in ioc_start_period()
1073 write_seqcount_end(&ioc->period_seqcount); in ioc_start_period()
1075 ioc->timer.expires = jiffies + usecs_to_jiffies(ioc->period_us); in ioc_start_period()
1076 add_timer(&ioc->timer); in ioc_start_period()
1082 * is saved to be used as reference for later inuse in-period adjustments.
1087 struct ioc *ioc = iocg->ioc; in __propagate_weights()
1090 lockdep_assert_held(&ioc->lock); in __propagate_weights()
1097 if (list_empty(&iocg->active_list) && iocg->child_active_sum) { in __propagate_weights()
1098 inuse = DIV64_U64_ROUND_UP(active * iocg->child_inuse_sum, in __propagate_weights()
1099 iocg->child_active_sum); in __propagate_weights()
1104 iocg->last_inuse = iocg->inuse; in __propagate_weights()
1106 iocg->saved_margin = now->vnow - atomic64_read(&iocg->vtime); in __propagate_weights()
1108 if (active == iocg->active && inuse == iocg->inuse) in __propagate_weights()
1111 for (lvl = iocg->level - 1; lvl >= 0; lvl--) { in __propagate_weights()
1112 struct ioc_gq *parent = iocg->ancestors[lvl]; in __propagate_weights()
1113 struct ioc_gq *child = iocg->ancestors[lvl + 1]; in __propagate_weights()
1117 parent->child_active_sum += (s32)(active - child->active); in __propagate_weights()
1118 parent->child_inuse_sum += (s32)(inuse - child->inuse); in __propagate_weights()
1120 child->active = active; in __propagate_weights()
1121 child->inuse = inuse; in __propagate_weights()
1128 if (parent->child_active_sum) { in __propagate_weights()
1129 parent_active = parent->weight; in __propagate_weights()
1131 parent_active * parent->child_inuse_sum, in __propagate_weights()
1132 parent->child_active_sum); in __propagate_weights()
1136 if (parent_active == parent->active && in __propagate_weights()
1137 parent_inuse == parent->inuse) in __propagate_weights()
1144 ioc->weights_updated = true; in __propagate_weights()
1149 lockdep_assert_held(&ioc->lock); in commit_weights()
1151 if (ioc->weights_updated) { in commit_weights()
1154 atomic_inc(&ioc->hweight_gen); in commit_weights()
1155 ioc->weights_updated = false; in commit_weights()
1163 commit_weights(iocg->ioc); in propagate_weights()
1168 struct ioc *ioc = iocg->ioc; in current_hweight()
1173 /* hot path - if uptodate, use cached */ in current_hweight()
1174 ioc_gen = atomic_read(&ioc->hweight_gen); in current_hweight()
1175 if (ioc_gen == iocg->hweight_gen) in current_hweight()
1191 for (lvl = 0; lvl <= iocg->level - 1; lvl++) { in current_hweight()
1192 struct ioc_gq *parent = iocg->ancestors[lvl]; in current_hweight()
1193 struct ioc_gq *child = iocg->ancestors[lvl + 1]; in current_hweight()
1194 u64 active_sum = READ_ONCE(parent->child_active_sum); in current_hweight()
1195 u64 inuse_sum = READ_ONCE(parent->child_inuse_sum); in current_hweight()
1196 u32 active = READ_ONCE(child->active); in current_hweight()
1197 u32 inuse = READ_ONCE(child->inuse); in current_hweight()
1210 iocg->hweight_active = max_t(u32, hwa, 1); in current_hweight()
1211 iocg->hweight_inuse = max_t(u32, hwi, 1); in current_hweight()
1212 iocg->hweight_gen = ioc_gen; in current_hweight()
1215 *hw_activep = iocg->hweight_active; in current_hweight()
1217 *hw_inusep = iocg->hweight_inuse; in current_hweight()
1227 u32 inuse = iocg->active; in current_hweight_max()
1231 lockdep_assert_held(&iocg->ioc->lock); in current_hweight_max()
1233 for (lvl = iocg->level - 1; lvl >= 0; lvl--) { in current_hweight_max()
1234 struct ioc_gq *parent = iocg->ancestors[lvl]; in current_hweight_max()
1235 struct ioc_gq *child = iocg->ancestors[lvl + 1]; in current_hweight_max()
1237 child_inuse_sum = parent->child_inuse_sum + inuse - child->inuse; in current_hweight_max()
1239 inuse = DIV64_U64_ROUND_UP(parent->active * child_inuse_sum, in current_hweight_max()
1240 parent->child_active_sum); in current_hweight_max()
1248 struct ioc *ioc = iocg->ioc; in weight_updated()
1250 struct ioc_cgrp *iocc = blkcg_to_iocc(blkg->blkcg); in weight_updated()
1253 lockdep_assert_held(&ioc->lock); in weight_updated()
1255 weight = iocg->cfg_weight ?: iocc->dfl_weight; in weight_updated()
1256 if (weight != iocg->weight && iocg->active) in weight_updated()
1257 propagate_weights(iocg, weight, iocg->inuse, true, now); in weight_updated()
1258 iocg->weight = weight; in weight_updated()
1263 struct ioc *ioc = iocg->ioc; in iocg_activate()
1272 if (!list_empty(&iocg->active_list)) { in iocg_activate()
1274 cur_period = atomic64_read(&ioc->cur_period); in iocg_activate()
1275 if (atomic64_read(&iocg->active_period) != cur_period) in iocg_activate()
1276 atomic64_set(&iocg->active_period, cur_period); in iocg_activate()
1281 if (iocg->child_active_sum) in iocg_activate()
1284 spin_lock_irq(&ioc->lock); in iocg_activate()
1289 cur_period = atomic64_read(&ioc->cur_period); in iocg_activate()
1290 last_period = atomic64_read(&iocg->active_period); in iocg_activate()
1291 atomic64_set(&iocg->active_period, cur_period); in iocg_activate()
1293 /* already activated or breaking leaf-only constraint? */ in iocg_activate()
1294 if (!list_empty(&iocg->active_list)) in iocg_activate()
1296 for (i = iocg->level - 1; i > 0; i--) in iocg_activate()
1297 if (!list_empty(&iocg->ancestors[i]->active_list)) in iocg_activate()
1300 if (iocg->child_active_sum) in iocg_activate()
1307 vtarget = now->vnow - ioc->margins.target; in iocg_activate()
1308 vtime = atomic64_read(&iocg->vtime); in iocg_activate()
1310 atomic64_add(vtarget - vtime, &iocg->vtime); in iocg_activate()
1311 atomic64_add(vtarget - vtime, &iocg->done_vtime); in iocg_activate()
1319 iocg->hweight_gen = atomic_read(&ioc->hweight_gen) - 1; in iocg_activate()
1320 list_add(&iocg->active_list, &ioc->active_iocgs); in iocg_activate()
1322 propagate_weights(iocg, iocg->weight, in iocg_activate()
1323 iocg->last_inuse ?: iocg->weight, true, now); in iocg_activate()
1328 iocg->activated_at = now->now; in iocg_activate()
1330 if (ioc->running == IOC_IDLE) { in iocg_activate()
1331 ioc->running = IOC_RUNNING; in iocg_activate()
1332 ioc->dfgv_period_at = now->now; in iocg_activate()
1333 ioc->dfgv_period_rem = 0; in iocg_activate()
1338 spin_unlock_irq(&ioc->lock); in iocg_activate()
1342 spin_unlock_irq(&ioc->lock); in iocg_activate()
1348 struct ioc *ioc = iocg->ioc; in iocg_kick_delay()
1354 lockdep_assert_held(&iocg->waitq.lock); in iocg_kick_delay()
1360 if (time_before64(now->now, iocg->delay_at)) in iocg_kick_delay()
1363 /* calculate the current delay in effect - 1/2 every second */ in iocg_kick_delay()
1364 tdelta = now->now - iocg->delay_at; in iocg_kick_delay()
1366 if (iocg->delay && shift < BITS_PER_LONG) in iocg_kick_delay()
1367 delay = iocg->delay >> shift; in iocg_kick_delay()
1373 vover = atomic64_read(&iocg->vtime) + in iocg_kick_delay()
1374 abs_cost_to_cost(iocg->abs_vdebt, hwa) - now->vnow; in iocg_kick_delay()
1376 ioc->period_us * ioc->vtime_base_rate); in iocg_kick_delay()
1384 div_u64((MAX_DELAY - MIN_DELAY) * in iocg_kick_delay()
1385 (vover_pct - MIN_DELAY_THR_PCT), in iocg_kick_delay()
1386 MAX_DELAY_THR_PCT - MIN_DELAY_THR_PCT); in iocg_kick_delay()
1390 iocg->delay = new_delay; in iocg_kick_delay()
1391 iocg->delay_at = now->now; in iocg_kick_delay()
1396 if (!iocg->indelay_since) in iocg_kick_delay()
1397 iocg->indelay_since = now->now; in iocg_kick_delay()
1401 if (iocg->indelay_since) { in iocg_kick_delay()
1402 iocg->stat.indelay_us += now->now - iocg->indelay_since; in iocg_kick_delay()
1403 iocg->indelay_since = 0; in iocg_kick_delay()
1405 iocg->delay = 0; in iocg_kick_delay()
1416 lockdep_assert_held(&iocg->ioc->lock); in iocg_incur_debt()
1417 lockdep_assert_held(&iocg->waitq.lock); in iocg_incur_debt()
1418 WARN_ON_ONCE(list_empty(&iocg->active_list)); in iocg_incur_debt()
1424 if (!iocg->abs_vdebt && abs_cost) { in iocg_incur_debt()
1425 iocg->indebt_since = now->now; in iocg_incur_debt()
1426 propagate_weights(iocg, iocg->active, 0, false, now); in iocg_incur_debt()
1429 iocg->abs_vdebt += abs_cost; in iocg_incur_debt()
1431 gcs = get_cpu_ptr(iocg->pcpu_stat); in iocg_incur_debt()
1432 local64_add(abs_cost, &gcs->abs_vusage); in iocg_incur_debt()
1439 lockdep_assert_held(&iocg->ioc->lock); in iocg_pay_debt()
1440 lockdep_assert_held(&iocg->waitq.lock); in iocg_pay_debt()
1443 * make sure that nobody messed with @iocg. Check iocg->pd.online in iocg_pay_debt()
1446 WARN_ON_ONCE(list_empty(&iocg->active_list) && iocg->pd.online); in iocg_pay_debt()
1447 WARN_ON_ONCE(iocg->inuse > 1); in iocg_pay_debt()
1449 iocg->abs_vdebt -= min(abs_vpay, iocg->abs_vdebt); in iocg_pay_debt()
1452 if (!iocg->abs_vdebt) { in iocg_pay_debt()
1453 iocg->stat.indebt_us += now->now - iocg->indebt_since; in iocg_pay_debt()
1454 iocg->indebt_since = 0; in iocg_pay_debt()
1456 propagate_weights(iocg, iocg->active, iocg->last_inuse, in iocg_pay_debt()
1466 u64 cost = abs_cost_to_cost(wait->abs_cost, ctx->hw_inuse); in iocg_wake_fn() local
1468 ctx->vbudget -= cost; in iocg_wake_fn()
1470 if (ctx->vbudget < 0) in iocg_wake_fn()
1471 return -1; in iocg_wake_fn()
1473 iocg_commit_bio(ctx->iocg, wait->bio, wait->abs_cost, cost); in iocg_wake_fn()
1474 wait->committed = true; in iocg_wake_fn()
1484 list_del_init_careful(&wq_entry->entry); in iocg_wake_fn()
1490 * accordingly. When @pay_debt is %true, the caller must be holding ioc->lock in
1491 * addition to iocg->waitq.lock.
1496 struct ioc *ioc = iocg->ioc; in iocg_kick_waitq()
1502 lockdep_assert_held(&iocg->waitq.lock); in iocg_kick_waitq()
1505 vbudget = now->vnow - atomic64_read(&iocg->vtime); in iocg_kick_waitq()
1508 if (pay_debt && iocg->abs_vdebt && vbudget > 0) { in iocg_kick_waitq()
1510 u64 abs_vpay = min_t(u64, abs_vbudget, iocg->abs_vdebt); in iocg_kick_waitq()
1513 lockdep_assert_held(&ioc->lock); in iocg_kick_waitq()
1515 atomic64_add(vpay, &iocg->vtime); in iocg_kick_waitq()
1516 atomic64_add(vpay, &iocg->done_vtime); in iocg_kick_waitq()
1518 vbudget -= vpay; in iocg_kick_waitq()
1521 if (iocg->abs_vdebt || iocg->delay) in iocg_kick_waitq()
1530 if (iocg->abs_vdebt) { in iocg_kick_waitq()
1531 s64 vdebt = abs_cost_to_cost(iocg->abs_vdebt, hwa); in iocg_kick_waitq()
1532 vbudget = min_t(s64, 0, vbudget - vdebt); in iocg_kick_waitq()
1543 __wake_up_locked_key(&iocg->waitq, TASK_NORMAL, &ctx); in iocg_kick_waitq()
1545 if (!waitqueue_active(&iocg->waitq)) { in iocg_kick_waitq()
1546 if (iocg->wait_since) { in iocg_kick_waitq()
1547 iocg->stat.wait_us += now->now - iocg->wait_since; in iocg_kick_waitq()
1548 iocg->wait_since = 0; in iocg_kick_waitq()
1553 if (!iocg->wait_since) in iocg_kick_waitq()
1554 iocg->wait_since = now->now; in iocg_kick_waitq()
1560 vshortage = -ctx.vbudget; in iocg_kick_waitq()
1561 expires = now->now_ns + in iocg_kick_waitq()
1562 DIV64_U64_ROUND_UP(vshortage, ioc->vtime_base_rate) * in iocg_kick_waitq()
1564 expires += ioc->timer_slack_ns; in iocg_kick_waitq()
1567 oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->waitq_timer)); in iocg_kick_waitq()
1568 if (hrtimer_is_queued(&iocg->waitq_timer) && in iocg_kick_waitq()
1569 abs(oexpires - expires) <= ioc->timer_slack_ns) in iocg_kick_waitq()
1572 hrtimer_start_range_ns(&iocg->waitq_timer, ns_to_ktime(expires), in iocg_kick_waitq()
1573 ioc->timer_slack_ns, HRTIMER_MODE_ABS); in iocg_kick_waitq()
1579 bool pay_debt = READ_ONCE(iocg->abs_vdebt); in iocg_waitq_timer_fn()
1583 ioc_now(iocg->ioc, &now); in iocg_waitq_timer_fn()
1600 struct ioc_pcpu_stat *stat = per_cpu_ptr(ioc->pcpu_stat, cpu); in ioc_lat_stat()
1604 u32 this_met = local_read(&stat->missed[rw].nr_met); in ioc_lat_stat()
1605 u32 this_missed = local_read(&stat->missed[rw].nr_missed); in ioc_lat_stat()
1607 nr_met[rw] += this_met - stat->missed[rw].last_met; in ioc_lat_stat()
1608 nr_missed[rw] += this_missed - stat->missed[rw].last_missed; in ioc_lat_stat()
1609 stat->missed[rw].last_met = this_met; in ioc_lat_stat()
1610 stat->missed[rw].last_missed = this_missed; in ioc_lat_stat()
1613 this_rq_wait_ns = local64_read(&stat->rq_wait_ns); in ioc_lat_stat()
1614 rq_wait_ns += this_rq_wait_ns - stat->last_rq_wait_ns; in ioc_lat_stat()
1615 stat->last_rq_wait_ns = this_rq_wait_ns; in ioc_lat_stat()
1628 ioc->period_us * NSEC_PER_USEC); in ioc_lat_stat()
1634 struct ioc *ioc = iocg->ioc; in iocg_is_idle()
1637 if (atomic64_read(&iocg->active_period) == in iocg_is_idle()
1638 atomic64_read(&ioc->cur_period)) in iocg_is_idle()
1642 if (atomic64_read(&iocg->done_vtime) != atomic64_read(&iocg->vtime)) in iocg_is_idle()
1649 * Call this function on the target leaf @iocg's to build pre-order traversal
1651 * ->walk_list and the caller is responsible for dissolving the list after use.
1658 WARN_ON_ONCE(!list_empty(&iocg->walk_list)); in iocg_build_inner_walk()
1661 for (lvl = iocg->level - 1; lvl >= 0; lvl--) { in iocg_build_inner_walk()
1662 if (!list_empty(&iocg->ancestors[lvl]->walk_list)) in iocg_build_inner_walk()
1666 /* walk down and visit the inner nodes to get pre-order traversal */ in iocg_build_inner_walk()
1667 while (++lvl <= iocg->level - 1) { in iocg_build_inner_walk()
1668 struct ioc_gq *inner = iocg->ancestors[lvl]; in iocg_build_inner_walk()
1671 list_add_tail(&inner->walk_list, inner_walk); in iocg_build_inner_walk()
1678 if (iocg->level > 0) { in iocg_flush_stat_upward()
1680 &iocg->ancestors[iocg->level - 1]->stat; in iocg_flush_stat_upward()
1682 parent_stat->usage_us += in iocg_flush_stat_upward()
1683 iocg->stat.usage_us - iocg->last_stat.usage_us; in iocg_flush_stat_upward()
1684 parent_stat->wait_us += in iocg_flush_stat_upward()
1685 iocg->stat.wait_us - iocg->last_stat.wait_us; in iocg_flush_stat_upward()
1686 parent_stat->indebt_us += in iocg_flush_stat_upward()
1687 iocg->stat.indebt_us - iocg->last_stat.indebt_us; in iocg_flush_stat_upward()
1688 parent_stat->indelay_us += in iocg_flush_stat_upward()
1689 iocg->stat.indelay_us - iocg->last_stat.indelay_us; in iocg_flush_stat_upward()
1692 iocg->last_stat = iocg->stat; in iocg_flush_stat_upward()
1695 /* collect per-cpu counters and propagate the deltas to the parent */
1698 struct ioc *ioc = iocg->ioc; in iocg_flush_stat_leaf()
1703 lockdep_assert_held(&iocg->ioc->lock); in iocg_flush_stat_leaf()
1705 /* collect per-cpu counters */ in iocg_flush_stat_leaf()
1708 per_cpu_ptr(&iocg->pcpu_stat->abs_vusage, cpu)); in iocg_flush_stat_leaf()
1710 vusage_delta = abs_vusage - iocg->last_stat_abs_vusage; in iocg_flush_stat_leaf()
1711 iocg->last_stat_abs_vusage = abs_vusage; in iocg_flush_stat_leaf()
1713 iocg->usage_delta_us = div64_u64(vusage_delta, ioc->vtime_base_rate); in iocg_flush_stat_leaf()
1714 iocg->stat.usage_us += iocg->usage_delta_us; in iocg_flush_stat_leaf()
1734 list_del_init(&iocg->walk_list); in iocg_flush_stat()
1746 struct ioc *ioc = iocg->ioc; in hweight_after_donation()
1747 u64 vtime = atomic64_read(&iocg->vtime); in hweight_after_donation()
1751 if (iocg->abs_vdebt) in hweight_after_donation()
1755 if (waitqueue_active(&iocg->waitq) || in hweight_after_donation()
1756 time_after64(vtime, now->vnow - ioc->margins.min)) in hweight_after_donation()
1760 excess = now->vnow - vtime - ioc->margins.target; in hweight_after_donation()
1762 atomic64_add(excess, &iocg->vtime); in hweight_after_donation()
1763 atomic64_add(excess, &iocg->done_vtime); in hweight_after_donation()
1765 ioc->vtime_err -= div64_u64(excess * old_hwi, WEIGHT_ONE); in hweight_after_donation()
1775 * new budget (1 - MARGIN_TARGET) and the leftover from the last period in hweight_after_donation()
1778 * usage = (1 - MARGIN_TARGET + delta) * new_hwi in hweight_after_donation()
1782 * new_hwi = usage / (1 - MARGIN_TARGET + delta) in hweight_after_donation()
1784 delta = div64_s64(WEIGHT_ONE * (now->vnow - vtime), in hweight_after_donation()
1785 now->vnow - ioc->period_at_vtime); in hweight_after_donation()
1787 new_hwi = div64_s64(WEIGHT_ONE * usage, WEIGHT_ONE - target + delta); in hweight_after_donation()
1793 * For work-conservation, an iocg which isn't using all of its share should
1794 * donate the leftover to other iocgs. There are two ways to achieve this - 1.
1815 * Given the weights and target after-donation hweight_inuse values, Andy's
1817 * sibling level to maintain the relative relationship between all non-donating
1819 * non-donating parts, calculates global donation rate which is used to
1820 * determine the target hweight_inuse for each node, and then derives per-level
1827 * https://drive.google.com/file/d/1vONz1-fzVO7oY5DXXsLjSxEtYYQbOvsE
1837 * f is the sum of the absolute budgets of non-donating nodes in the subtree.
1840 * w_f is the non-donating portion of w. w_f = w * f / b
1843 * s_f and s_t are the non-donating and donating portions of s.
1845 * Subscript p denotes the parent's counterpart and ' the adjusted value - e.g.
1869 after_sum += iocg->hweight_after_donation; in transfer_surpluses()
1871 if (iocg->hweight_after_donation > hwa) { in transfer_surpluses()
1872 over_sum += iocg->hweight_after_donation; in transfer_surpluses()
1873 list_add(&iocg->walk_list, &over_hwa); in transfer_surpluses()
1882 u32 over_delta = after_sum - (WEIGHT_ONE - 1); in transfer_surpluses()
1884 over_target = over_sum - over_delta; in transfer_surpluses()
1891 iocg->hweight_after_donation = in transfer_surpluses()
1892 div_u64((u64)iocg->hweight_after_donation * in transfer_surpluses()
1894 list_del_init(&iocg->walk_list); in transfer_surpluses()
1898 * Build pre-order inner node walk list and prepare for donation in transfer_surpluses()
1906 WARN_ON_ONCE(root_iocg->level > 0); in transfer_surpluses()
1909 iocg->child_adjusted_sum = 0; in transfer_surpluses()
1910 iocg->hweight_donating = 0; in transfer_surpluses()
1911 iocg->hweight_after_donation = 0; in transfer_surpluses()
1919 struct ioc_gq *parent = iocg->ancestors[iocg->level - 1]; in transfer_surpluses()
1921 parent->hweight_donating += iocg->hweight_donating; in transfer_surpluses()
1922 parent->hweight_after_donation += iocg->hweight_after_donation; in transfer_surpluses()
1926 if (iocg->level > 0) { in transfer_surpluses()
1927 struct ioc_gq *parent = iocg->ancestors[iocg->level - 1]; in transfer_surpluses()
1929 parent->hweight_donating += iocg->hweight_donating; in transfer_surpluses()
1930 parent->hweight_after_donation += iocg->hweight_after_donation; in transfer_surpluses()
1936 * within the accepted ranges as we're doing low res calculations with in transfer_surpluses()
1940 if (iocg->level) { in transfer_surpluses()
1941 struct ioc_gq *parent = iocg->ancestors[iocg->level - 1]; in transfer_surpluses()
1943 iocg->hweight_active = DIV64_U64_ROUND_UP( in transfer_surpluses()
1944 (u64)parent->hweight_active * iocg->active, in transfer_surpluses()
1945 parent->child_active_sum); in transfer_surpluses()
1949 iocg->hweight_donating = min(iocg->hweight_donating, in transfer_surpluses()
1950 iocg->hweight_active); in transfer_surpluses()
1951 iocg->hweight_after_donation = min(iocg->hweight_after_donation, in transfer_surpluses()
1952 iocg->hweight_donating - 1); in transfer_surpluses()
1953 if (WARN_ON_ONCE(iocg->hweight_active <= 1 || in transfer_surpluses()
1954 iocg->hweight_donating <= 1 || in transfer_surpluses()
1955 iocg->hweight_after_donation == 0)) { in transfer_surpluses()
1957 pr_cont_cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup); in transfer_surpluses()
1959 iocg->hweight_active, iocg->hweight_donating, in transfer_surpluses()
1960 iocg->hweight_after_donation); in transfer_surpluses()
1965 * Calculate the global donation rate (gamma) - the rate to adjust in transfer_surpluses()
1966 * non-donating budgets by. in transfer_surpluses()
1972 * hweights can't be whole; however, due to the round-ups during hweight in transfer_surpluses()
1973 * calculations, root_iocg->hweight_donating might still end up equal to in transfer_surpluses()
1976 * gamma = (1 - t_r') / (1 - t_r) in transfer_surpluses()
1979 (WEIGHT_ONE - root_iocg->hweight_after_donation) * WEIGHT_ONE, in transfer_surpluses()
1980 WEIGHT_ONE - min_t(u32, root_iocg->hweight_donating, WEIGHT_ONE - 1)); in transfer_surpluses()
1991 if (iocg->level == 0) { in transfer_surpluses()
1993 iocg->child_adjusted_sum = DIV64_U64_ROUND_UP( in transfer_surpluses()
1994 iocg->child_active_sum * (WEIGHT_ONE - iocg->hweight_donating), in transfer_surpluses()
1995 WEIGHT_ONE - iocg->hweight_after_donation); in transfer_surpluses()
1999 parent = iocg->ancestors[iocg->level - 1]; in transfer_surpluses()
2002 iocg->hweight_inuse = DIV64_U64_ROUND_UP( in transfer_surpluses()
2003 (u64)gamma * (iocg->hweight_active - iocg->hweight_donating), in transfer_surpluses()
2004 WEIGHT_ONE) + iocg->hweight_after_donation; in transfer_surpluses()
2008 (u64)parent->child_adjusted_sum * iocg->hweight_inuse, in transfer_surpluses()
2009 parent->hweight_inuse); in transfer_surpluses()
2013 iocg->child_active_sum * iocg->hweight_donating, in transfer_surpluses()
2014 iocg->hweight_active); in transfer_surpluses()
2015 sf = iocg->child_active_sum - st; in transfer_surpluses()
2017 (u64)iocg->active * iocg->hweight_donating, in transfer_surpluses()
2018 iocg->hweight_active); in transfer_surpluses()
2020 (u64)inuse * iocg->hweight_after_donation, in transfer_surpluses()
2021 iocg->hweight_inuse); in transfer_surpluses()
2023 iocg->child_adjusted_sum = sf + DIV64_U64_ROUND_UP(st * wptp, wpt); in transfer_surpluses()
2027 * All inner nodes now have ->hweight_inuse and ->child_adjusted_sum and in transfer_surpluses()
2031 struct ioc_gq *parent = iocg->ancestors[iocg->level - 1]; in transfer_surpluses()
2035 * In-debt iocgs participated in the donation calculation with in transfer_surpluses()
2038 * @iocg->inuse stay at the minimum and we don't wanna in transfer_surpluses()
2041 if (iocg->abs_vdebt) { in transfer_surpluses()
2042 WARN_ON_ONCE(iocg->inuse > 1); in transfer_surpluses()
2048 parent->child_adjusted_sum * iocg->hweight_after_donation, in transfer_surpluses()
2049 parent->hweight_inuse); in transfer_surpluses()
2052 iocg->inuse, inuse, in transfer_surpluses()
2053 iocg->hweight_inuse, in transfer_surpluses()
2054 iocg->hweight_after_donation); in transfer_surpluses()
2056 __propagate_weights(iocg, iocg->active, inuse, true, now); in transfer_surpluses()
2061 list_del_init(&iocg->walk_list); in transfer_surpluses()
2065 * A low weight iocg can amass a large amount of debt, for example, when
2068 * more. If there are no other subsequent IO issuers, the in-debt iocg may end
2083 ioc->dfgv_period_at = now->now; in ioc_forgive_debts()
2084 ioc->dfgv_period_rem = 0; in ioc_forgive_debts()
2085 ioc->dfgv_usage_us_sum = 0; in ioc_forgive_debts()
2095 if (ioc->busy_level > 0) in ioc_forgive_debts()
2096 usage_us_sum = max_t(u64, usage_us_sum, ioc->period_us); in ioc_forgive_debts()
2098 ioc->dfgv_usage_us_sum += usage_us_sum; in ioc_forgive_debts()
2099 if (time_before64(now->now, ioc->dfgv_period_at + DFGV_PERIOD)) in ioc_forgive_debts()
2106 dur = now->now - ioc->dfgv_period_at; in ioc_forgive_debts()
2107 usage_pct = div64_u64(100 * ioc->dfgv_usage_us_sum, dur); in ioc_forgive_debts()
2109 ioc->dfgv_period_at = now->now; in ioc_forgive_debts()
2110 ioc->dfgv_usage_us_sum = 0; in ioc_forgive_debts()
2114 ioc->dfgv_period_rem = 0; in ioc_forgive_debts()
2123 * run and carrying over the left-over duration in @ioc->dfgv_period_rem in ioc_forgive_debts()
2124 * - if ioc period is 75% of DFGV_PERIOD, one out of three consecutive in ioc_forgive_debts()
2127 nr_cycles = dur + ioc->dfgv_period_rem; in ioc_forgive_debts()
2128 ioc->dfgv_period_rem = do_div(nr_cycles, DFGV_PERIOD); in ioc_forgive_debts()
2130 list_for_each_entry(iocg, &ioc->active_iocgs, active_list) { in ioc_forgive_debts()
2133 if (!iocg->abs_vdebt && !iocg->delay) in ioc_forgive_debts()
2136 spin_lock(&iocg->waitq.lock); in ioc_forgive_debts()
2138 old_debt = iocg->abs_vdebt; in ioc_forgive_debts()
2139 old_delay = iocg->delay; in ioc_forgive_debts()
2141 nr_cycles_shift = min_t(u64, nr_cycles, BITS_PER_LONG - 1); in ioc_forgive_debts()
2142 if (iocg->abs_vdebt) in ioc_forgive_debts()
2143 iocg->abs_vdebt = iocg->abs_vdebt >> nr_cycles_shift ?: 1; in ioc_forgive_debts()
2145 if (iocg->delay) in ioc_forgive_debts()
2146 iocg->delay = iocg->delay >> nr_cycles_shift ?: 1; in ioc_forgive_debts()
2151 old_debt, iocg->abs_vdebt, in ioc_forgive_debts()
2152 old_delay, iocg->delay); in ioc_forgive_debts()
2154 spin_unlock(&iocg->waitq.lock); in ioc_forgive_debts()
2173 list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) { in ioc_check_iocgs()
2174 if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt && in ioc_check_iocgs()
2175 !iocg->delay && !iocg_is_idle(iocg)) in ioc_check_iocgs()
2178 spin_lock(&iocg->waitq.lock); in ioc_check_iocgs()
2181 if (iocg->wait_since) { in ioc_check_iocgs()
2182 iocg->stat.wait_us += now->now - iocg->wait_since; in ioc_check_iocgs()
2183 iocg->wait_since = now->now; in ioc_check_iocgs()
2185 if (iocg->indebt_since) { in ioc_check_iocgs()
2186 iocg->stat.indebt_us += in ioc_check_iocgs()
2187 now->now - iocg->indebt_since; in ioc_check_iocgs()
2188 iocg->indebt_since = now->now; in ioc_check_iocgs()
2190 if (iocg->indelay_since) { in ioc_check_iocgs()
2191 iocg->stat.indelay_us += in ioc_check_iocgs()
2192 now->now - iocg->indelay_since; in ioc_check_iocgs()
2193 iocg->indelay_since = now->now; in ioc_check_iocgs()
2196 if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt || in ioc_check_iocgs()
2197 iocg->delay) { in ioc_check_iocgs()
2200 if (iocg->abs_vdebt || iocg->delay) in ioc_check_iocgs()
2204 u64 vtime = atomic64_read(&iocg->vtime); in ioc_check_iocgs()
2213 excess = now->vnow - vtime - ioc->margins.target; in ioc_check_iocgs()
2218 ioc->vtime_err -= div64_u64(excess * old_hwi, in ioc_check_iocgs()
2223 atomic64_read(&iocg->active_period), in ioc_check_iocgs()
2224 atomic64_read(&ioc->cur_period), vtime); in ioc_check_iocgs()
2226 list_del_init(&iocg->active_list); in ioc_check_iocgs()
2229 spin_unlock(&iocg->waitq.lock); in ioc_check_iocgs()
2254 spin_lock_irq(&ioc->lock); in ioc_timer_fn()
2256 ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM]; in ioc_timer_fn()
2257 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM]; in ioc_timer_fn()
2260 period_vtime = now.vnow - ioc->period_at_vtime; in ioc_timer_fn()
2262 spin_unlock_irq(&ioc->lock); in ioc_timer_fn()
2270 * below needs updated usage stat. Let's bring stat up-to-date. in ioc_timer_fn()
2272 iocg_flush_stat(&ioc->active_iocgs, &now); in ioc_timer_fn()
2275 list_for_each_entry(iocg, &ioc->active_iocgs, active_list) { in ioc_timer_fn()
2283 vdone = atomic64_read(&iocg->done_vtime); in ioc_timer_fn()
2284 vtime = atomic64_read(&iocg->vtime); in ioc_timer_fn()
2289 * in-flight for longer than a period. Detect them by in ioc_timer_fn()
2294 !atomic_read(&iocg_to_blkg(iocg)->use_delay) && in ioc_timer_fn()
2296 time_after64(vtime, now.vnow - in ioc_timer_fn()
2298 time_before64(vdone, now.vnow - period_vtime)) in ioc_timer_fn()
2302 * Determine absolute usage factoring in in-flight IOs to avoid in ioc_timer_fn()
2303 * high-latency completions appearing as idle. in ioc_timer_fn()
2305 usage_us = iocg->usage_delta_us; in ioc_timer_fn()
2309 WARN_ON_ONCE(!list_empty(&iocg->surplus_list)); in ioc_timer_fn()
2311 (!waitqueue_active(&iocg->waitq) && in ioc_timer_fn()
2312 time_before64(vtime, now.vnow - ioc->margins.low))) { in ioc_timer_fn()
2318 cost_to_abs_cost(vtime - vdone, hw_inuse), in ioc_timer_fn()
2319 ioc->vtime_base_rate); in ioc_timer_fn()
2325 if (time_after64(iocg->activated_at, ioc->period_at)) in ioc_timer_fn()
2326 usage_dur = max_t(u64, now.now - iocg->activated_at, 1); in ioc_timer_fn()
2328 usage_dur = max_t(u64, now.now - ioc->period_at, 1); in ioc_timer_fn()
2351 iocg->hweight_donating = hwa; in ioc_timer_fn()
2352 iocg->hweight_after_donation = new_hwi; in ioc_timer_fn()
2353 list_add(&iocg->surplus_list, &surpluses); in ioc_timer_fn()
2354 } else if (!iocg->abs_vdebt) { in ioc_timer_fn()
2366 iocg->inuse, iocg->active, in ioc_timer_fn()
2367 iocg->hweight_inuse, new_hwi); in ioc_timer_fn()
2369 __propagate_weights(iocg, iocg->active, in ioc_timer_fn()
2370 iocg->active, true, &now); in ioc_timer_fn()
2386 list_del_init(&iocg->surplus_list); in ioc_timer_fn()
2394 prev_busy_level = ioc->busy_level; in ioc_timer_fn()
2399 ioc->busy_level = max(ioc->busy_level, 0); in ioc_timer_fn()
2400 ioc->busy_level++; in ioc_timer_fn()
2410 ioc->busy_level = min(ioc->busy_level, 0); in ioc_timer_fn()
2417 ioc->busy_level--; in ioc_timer_fn()
2425 ioc->busy_level = 0; in ioc_timer_fn()
2429 ioc->busy_level = 0; in ioc_timer_fn()
2432 ioc->busy_level = clamp(ioc->busy_level, -1000, 1000); in ioc_timer_fn()
2445 atomic64_inc(&ioc->cur_period); in ioc_timer_fn()
2447 if (ioc->running != IOC_STOP) { in ioc_timer_fn()
2448 if (!list_empty(&ioc->active_iocgs)) { in ioc_timer_fn()
2451 ioc->busy_level = 0; in ioc_timer_fn()
2452 ioc->vtime_err = 0; in ioc_timer_fn()
2453 ioc->running = IOC_IDLE; in ioc_timer_fn()
2459 spin_unlock_irq(&ioc->lock); in ioc_timer_fn()
2465 struct ioc *ioc = iocg->ioc; in adjust_inuse_and_calc_cost()
2466 struct ioc_margins *margins = &ioc->margins; in adjust_inuse_and_calc_cost()
2467 u32 __maybe_unused old_inuse = iocg->inuse, __maybe_unused old_hwi; in adjust_inuse_and_calc_cost()
2470 u64 cost, new_inuse; in adjust_inuse_and_calc_cost() local
2475 cost = abs_cost_to_cost(abs_cost, hwi); in adjust_inuse_and_calc_cost()
2476 margin = now->vnow - vtime - cost; in adjust_inuse_and_calc_cost()
2479 if (iocg->abs_vdebt) in adjust_inuse_and_calc_cost()
2480 return cost; in adjust_inuse_and_calc_cost()
2486 if (margin >= iocg->saved_margin || margin >= margins->low || in adjust_inuse_and_calc_cost()
2487 iocg->inuse == iocg->active) in adjust_inuse_and_calc_cost()
2488 return cost; in adjust_inuse_and_calc_cost()
2490 spin_lock_irqsave(&ioc->lock, flags); in adjust_inuse_and_calc_cost()
2493 if (iocg->abs_vdebt || list_empty(&iocg->active_list)) { in adjust_inuse_and_calc_cost()
2494 spin_unlock_irqrestore(&ioc->lock, flags); in adjust_inuse_and_calc_cost()
2495 return cost; in adjust_inuse_and_calc_cost()
2500 * adj_step must be determined after acquiring ioc->lock - we might in adjust_inuse_and_calc_cost()
2502 * be reading 0 iocg->active before ioc->lock which will lead to in adjust_inuse_and_calc_cost()
2505 new_inuse = iocg->inuse; in adjust_inuse_and_calc_cost()
2506 adj_step = DIV_ROUND_UP(iocg->active * INUSE_ADJ_STEP_PCT, 100); in adjust_inuse_and_calc_cost()
2509 propagate_weights(iocg, iocg->active, new_inuse, true, now); in adjust_inuse_and_calc_cost()
2511 cost = abs_cost_to_cost(abs_cost, hwi); in adjust_inuse_and_calc_cost()
2512 } while (time_after64(vtime + cost, now->vnow) && in adjust_inuse_and_calc_cost()
2513 iocg->inuse != iocg->active); in adjust_inuse_and_calc_cost()
2515 spin_unlock_irqrestore(&ioc->lock, flags); in adjust_inuse_and_calc_cost()
2518 old_inuse, iocg->inuse, old_hwi, hwi); in adjust_inuse_and_calc_cost()
2520 return cost; in adjust_inuse_and_calc_cost()
2526 struct ioc *ioc = iocg->ioc; in calc_vtime_cost_builtin()
2530 u64 cost = 0; in calc_vtime_cost_builtin() local
2532 /* Can't calculate cost for empty bio */ in calc_vtime_cost_builtin()
2533 if (!bio->bi_iter.bi_size) in calc_vtime_cost_builtin()
2538 coef_seqio = ioc->params.lcoefs[LCOEF_RSEQIO]; in calc_vtime_cost_builtin()
2539 coef_randio = ioc->params.lcoefs[LCOEF_RRANDIO]; in calc_vtime_cost_builtin()
2540 coef_page = ioc->params.lcoefs[LCOEF_RPAGE]; in calc_vtime_cost_builtin()
2543 coef_seqio = ioc->params.lcoefs[LCOEF_WSEQIO]; in calc_vtime_cost_builtin()
2544 coef_randio = ioc->params.lcoefs[LCOEF_WRANDIO]; in calc_vtime_cost_builtin()
2545 coef_page = ioc->params.lcoefs[LCOEF_WPAGE]; in calc_vtime_cost_builtin()
2551 if (iocg->cursor) { in calc_vtime_cost_builtin()
2552 seek_pages = abs(bio->bi_iter.bi_sector - iocg->cursor); in calc_vtime_cost_builtin()
2558 cost += coef_randio; in calc_vtime_cost_builtin()
2560 cost += coef_seqio; in calc_vtime_cost_builtin()
2563 cost += pages * coef_page; in calc_vtime_cost_builtin()
2565 *costp = cost; in calc_vtime_cost_builtin()
2570 u64 cost; in calc_vtime_cost() local
2572 calc_vtime_cost_builtin(bio, iocg, is_merge, &cost); in calc_vtime_cost()
2573 return cost; in calc_vtime_cost()
2583 *costp = pages * ioc->params.lcoefs[LCOEF_RPAGE]; in calc_size_vtime_cost_builtin()
2586 *costp = pages * ioc->params.lcoefs[LCOEF_WPAGE]; in calc_size_vtime_cost_builtin()
2595 u64 cost; in calc_size_vtime_cost() local
2597 calc_size_vtime_cost_builtin(rq, ioc, &cost); in calc_size_vtime_cost()
2598 return cost; in calc_size_vtime_cost()
2603 struct blkcg_gq *blkg = bio->bi_blkg; in ioc_rqos_throttle()
2608 u64 abs_cost, cost, vtime; in ioc_rqos_throttle() local
2613 if (!ioc->enabled || !iocg || !iocg->level) in ioc_rqos_throttle()
2616 /* calculate the absolute vtime cost */ in ioc_rqos_throttle()
2624 iocg->cursor = bio_end_sector(bio); in ioc_rqos_throttle()
2625 vtime = atomic64_read(&iocg->vtime); in ioc_rqos_throttle()
2626 cost = adjust_inuse_and_calc_cost(iocg, vtime, abs_cost, &now); in ioc_rqos_throttle()
2630 * tests are racy but the races aren't systemic - we only miss once in ioc_rqos_throttle()
2633 if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt && in ioc_rqos_throttle()
2634 time_before_eq64(vtime + cost, now.vnow)) { in ioc_rqos_throttle()
2635 iocg_commit_bio(iocg, bio, abs_cost, cost); in ioc_rqos_throttle()
2641 * cause priority inversions are punted to @ioc->aux_iocg and charged as in ioc_rqos_throttle()
2642 * debt. Otherwise, the issuer is blocked on @iocg->waitq. Debt handling in ioc_rqos_throttle()
2643 * requires @ioc->lock, waitq handling @iocg->waitq.lock. Determine in ioc_rqos_throttle()
2647 ioc_locked = use_debt || READ_ONCE(iocg->abs_vdebt); in ioc_rqos_throttle()
2653 * is synchronized against both ioc->lock and waitq.lock and we won't in ioc_rqos_throttle()
2658 if (unlikely(list_empty(&iocg->active_list))) { in ioc_rqos_throttle()
2660 iocg_commit_bio(iocg, bio, abs_cost, cost); in ioc_rqos_throttle()
2684 blkcg_schedule_throttle(rqos->disk, in ioc_rqos_throttle()
2685 (bio->bi_opf & REQ_SWAP) == REQ_SWAP); in ioc_rqos_throttle()
2691 if (!iocg->abs_vdebt && iocg->inuse != iocg->active) { in ioc_rqos_throttle()
2697 propagate_weights(iocg, iocg->active, iocg->active, true, in ioc_rqos_throttle()
2705 * or too long. Each wait entry records the absolute cost it's in ioc_rqos_throttle()
2706 * waiting for to allow re-evaluation using a custom wait entry. in ioc_rqos_throttle()
2711 * All waiters are on iocg->waitq and the wait states are in ioc_rqos_throttle()
2720 __add_wait_queue_entry_tail(&iocg->waitq, &wait.wait); in ioc_rqos_throttle()
2733 finish_wait(&iocg->waitq, &wait.wait); in ioc_rqos_throttle()
2739 struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg); in ioc_rqos_merge()
2743 u64 vtime, abs_cost, cost; in ioc_rqos_merge() local
2747 if (!ioc->enabled || !iocg || !iocg->level) in ioc_rqos_merge()
2756 vtime = atomic64_read(&iocg->vtime); in ioc_rqos_merge()
2757 cost = adjust_inuse_and_calc_cost(iocg, vtime, abs_cost, &now); in ioc_rqos_merge()
2761 blk_rq_pos(rq) + blk_rq_sectors(rq) == iocg->cursor) in ioc_rqos_merge()
2762 iocg->cursor = bio_end; in ioc_rqos_merge()
2766 * cost assigned. in ioc_rqos_merge()
2768 if (rq->bio && rq->bio->bi_iocost_cost && in ioc_rqos_merge()
2769 time_before_eq64(atomic64_read(&iocg->vtime) + cost, now.vnow)) { in ioc_rqos_merge()
2770 iocg_commit_bio(iocg, bio, abs_cost, cost); in ioc_rqos_merge()
2779 spin_lock_irqsave(&ioc->lock, flags); in ioc_rqos_merge()
2780 spin_lock(&iocg->waitq.lock); in ioc_rqos_merge()
2782 if (likely(!list_empty(&iocg->active_list))) { in ioc_rqos_merge()
2785 blkcg_schedule_throttle(rqos->disk, in ioc_rqos_merge()
2786 (bio->bi_opf & REQ_SWAP) == REQ_SWAP); in ioc_rqos_merge()
2788 iocg_commit_bio(iocg, bio, abs_cost, cost); in ioc_rqos_merge()
2791 spin_unlock(&iocg->waitq.lock); in ioc_rqos_merge()
2792 spin_unlock_irqrestore(&ioc->lock, flags); in ioc_rqos_merge()
2797 struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg); in ioc_rqos_done_bio()
2799 if (iocg && bio->bi_iocost_cost) in ioc_rqos_done_bio()
2800 atomic64_add(bio->bi_iocost_cost, &iocg->done_vtime); in ioc_rqos_done_bio()
2810 if (!ioc->enabled || !rq->alloc_time_ns || !rq->start_time_ns) in ioc_rqos_done()
2826 on_q_ns = blk_time_get_ns() - rq->alloc_time_ns; in ioc_rqos_done()
2827 rq_wait_ns = rq->start_time_ns - rq->alloc_time_ns; in ioc_rqos_done()
2830 ccs = get_cpu_ptr(ioc->pcpu_stat); in ioc_rqos_done()
2833 on_q_ns - size_nsec <= ioc->params.qos[pidx] * NSEC_PER_USEC) in ioc_rqos_done()
2834 local_inc(&ccs->missed[rw].nr_met); in ioc_rqos_done()
2836 local_inc(&ccs->missed[rw].nr_missed); in ioc_rqos_done()
2838 local64_add(rq_wait_ns, &ccs->rq_wait_ns); in ioc_rqos_done()
2847 spin_lock_irq(&ioc->lock); in ioc_rqos_queue_depth_changed()
2849 spin_unlock_irq(&ioc->lock); in ioc_rqos_queue_depth_changed()
2856 blkcg_deactivate_policy(rqos->disk, &blkcg_policy_iocost); in ioc_rqos_exit()
2858 spin_lock_irq(&ioc->lock); in ioc_rqos_exit()
2859 ioc->running = IOC_STOP; in ioc_rqos_exit()
2860 spin_unlock_irq(&ioc->lock); in ioc_rqos_exit()
2862 timer_shutdown_sync(&ioc->timer); in ioc_rqos_exit()
2863 free_percpu(ioc->pcpu_stat); in ioc_rqos_exit()
2883 return -ENOMEM; in blk_iocost_init()
2885 ioc->pcpu_stat = alloc_percpu(struct ioc_pcpu_stat); in blk_iocost_init()
2886 if (!ioc->pcpu_stat) { in blk_iocost_init()
2888 return -ENOMEM; in blk_iocost_init()
2892 struct ioc_pcpu_stat *ccs = per_cpu_ptr(ioc->pcpu_stat, cpu); in blk_iocost_init()
2894 for (i = 0; i < ARRAY_SIZE(ccs->missed); i++) { in blk_iocost_init()
2895 local_set(&ccs->missed[i].nr_met, 0); in blk_iocost_init()
2896 local_set(&ccs->missed[i].nr_missed, 0); in blk_iocost_init()
2898 local64_set(&ccs->rq_wait_ns, 0); in blk_iocost_init()
2901 spin_lock_init(&ioc->lock); in blk_iocost_init()
2902 timer_setup(&ioc->timer, ioc_timer_fn, 0); in blk_iocost_init()
2903 INIT_LIST_HEAD(&ioc->active_iocgs); in blk_iocost_init()
2905 ioc->running = IOC_IDLE; in blk_iocost_init()
2906 ioc->vtime_base_rate = VTIME_PER_USEC; in blk_iocost_init()
2907 atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC); in blk_iocost_init()
2908 seqcount_spinlock_init(&ioc->period_seqcount, &ioc->lock); in blk_iocost_init()
2909 ioc->period_at = ktime_to_us(blk_time_get()); in blk_iocost_init()
2910 atomic64_set(&ioc->cur_period, 0); in blk_iocost_init()
2911 atomic_set(&ioc->hweight_gen, 0); in blk_iocost_init()
2913 spin_lock_irq(&ioc->lock); in blk_iocost_init()
2914 ioc->autop_idx = AUTOP_INVALID; in blk_iocost_init()
2916 spin_unlock_irq(&ioc->lock); in blk_iocost_init()
2924 ret = rq_qos_add(&ioc->rqos, disk, RQ_QOS_COST, &ioc_rqos_ops); in blk_iocost_init()
2934 rq_qos_del(&ioc->rqos); in blk_iocost_init()
2936 free_percpu(ioc->pcpu_stat); in blk_iocost_init()
2949 iocc->dfl_weight = CGROUP_WEIGHT_DFL * WEIGHT_ONE; in ioc_cpd_alloc()
2950 return &iocc->cpd; in ioc_cpd_alloc()
2961 int levels = blkcg->css.cgroup->level + 1; in ioc_pd_alloc()
2965 disk->node_id); in ioc_pd_alloc()
2969 iocg->pcpu_stat = alloc_percpu_gfp(struct iocg_pcpu_stat, gfp); in ioc_pd_alloc()
2970 if (!iocg->pcpu_stat) { in ioc_pd_alloc()
2975 return &iocg->pd; in ioc_pd_alloc()
2981 struct blkcg_gq *blkg = pd_to_blkg(&iocg->pd); in ioc_pd_init()
2982 struct ioc *ioc = q_to_ioc(blkg->q); in ioc_pd_init()
2989 iocg->ioc = ioc; in ioc_pd_init()
2990 atomic64_set(&iocg->vtime, now.vnow); in ioc_pd_init()
2991 atomic64_set(&iocg->done_vtime, now.vnow); in ioc_pd_init()
2992 atomic64_set(&iocg->active_period, atomic64_read(&ioc->cur_period)); in ioc_pd_init()
2993 INIT_LIST_HEAD(&iocg->active_list); in ioc_pd_init()
2994 INIT_LIST_HEAD(&iocg->walk_list); in ioc_pd_init()
2995 INIT_LIST_HEAD(&iocg->surplus_list); in ioc_pd_init()
2996 iocg->hweight_active = WEIGHT_ONE; in ioc_pd_init()
2997 iocg->hweight_inuse = WEIGHT_ONE; in ioc_pd_init()
2999 init_waitqueue_head(&iocg->waitq); in ioc_pd_init()
3000 hrtimer_init(&iocg->waitq_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); in ioc_pd_init()
3001 iocg->waitq_timer.function = iocg_waitq_timer_fn; in ioc_pd_init()
3003 iocg->level = blkg->blkcg->css.cgroup->level; in ioc_pd_init()
3005 for (tblkg = blkg; tblkg; tblkg = tblkg->parent) { in ioc_pd_init()
3007 iocg->ancestors[tiocg->level] = tiocg; in ioc_pd_init()
3010 spin_lock_irqsave(&ioc->lock, flags); in ioc_pd_init()
3012 spin_unlock_irqrestore(&ioc->lock, flags); in ioc_pd_init()
3018 struct ioc *ioc = iocg->ioc; in ioc_pd_free()
3022 spin_lock_irqsave(&ioc->lock, flags); in ioc_pd_free()
3024 if (!list_empty(&iocg->active_list)) { in ioc_pd_free()
3029 list_del_init(&iocg->active_list); in ioc_pd_free()
3032 WARN_ON_ONCE(!list_empty(&iocg->walk_list)); in ioc_pd_free()
3033 WARN_ON_ONCE(!list_empty(&iocg->surplus_list)); in ioc_pd_free()
3035 spin_unlock_irqrestore(&ioc->lock, flags); in ioc_pd_free()
3037 hrtimer_cancel(&iocg->waitq_timer); in ioc_pd_free()
3039 free_percpu(iocg->pcpu_stat); in ioc_pd_free()
3046 struct ioc *ioc = iocg->ioc; in ioc_pd_stat()
3048 if (!ioc->enabled) in ioc_pd_stat()
3051 if (iocg->level == 0) { in ioc_pd_stat()
3053 ioc->vtime_base_rate * 10000, in ioc_pd_stat()
3055 seq_printf(s, " cost.vrate=%u.%02u", vp10k / 100, vp10k % 100); in ioc_pd_stat()
3058 seq_printf(s, " cost.usage=%llu", iocg->last_stat.usage_us); in ioc_pd_stat()
3061 seq_printf(s, " cost.wait=%llu cost.indebt=%llu cost.indelay=%llu", in ioc_pd_stat()
3062 iocg->last_stat.wait_us, in ioc_pd_stat()
3063 iocg->last_stat.indebt_us, in ioc_pd_stat()
3064 iocg->last_stat.indelay_us); in ioc_pd_stat()
3070 const char *dname = blkg_dev_name(pd->blkg); in ioc_weight_prfill()
3073 if (dname && iocg->cfg_weight) in ioc_weight_prfill()
3074 seq_printf(sf, "%s %u\n", dname, iocg->cfg_weight / WEIGHT_ONE); in ioc_weight_prfill()
3084 seq_printf(sf, "default %u\n", iocc->dfl_weight / WEIGHT_ONE); in ioc_weight_show()
3086 &blkcg_policy_iocost, seq_cft(sf)->private, false); in ioc_weight_show()
3105 return -EINVAL; in ioc_weight_write()
3108 return -EINVAL; in ioc_weight_write()
3110 spin_lock_irq(&blkcg->lock); in ioc_weight_write()
3111 iocc->dfl_weight = v * WEIGHT_ONE; in ioc_weight_write()
3112 hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { in ioc_weight_write()
3116 spin_lock(&iocg->ioc->lock); in ioc_weight_write()
3117 ioc_now(iocg->ioc, &now); in ioc_weight_write()
3119 spin_unlock(&iocg->ioc->lock); in ioc_weight_write()
3122 spin_unlock_irq(&blkcg->lock); in ioc_weight_write()
3144 spin_lock(&iocg->ioc->lock); in ioc_weight_write()
3145 iocg->cfg_weight = v * WEIGHT_ONE; in ioc_weight_write()
3146 ioc_now(iocg->ioc, &now); in ioc_weight_write()
3148 spin_unlock(&iocg->ioc->lock); in ioc_weight_write()
3154 ret = -EINVAL; in ioc_weight_write()
3163 const char *dname = blkg_dev_name(pd->blkg); in ioc_qos_prfill()
3164 struct ioc *ioc = pd_to_iocg(pd)->ioc; in ioc_qos_prfill()
3169 spin_lock(&ioc->lock); in ioc_qos_prfill()
3171 dname, ioc->enabled, ioc->user_qos_params ? "user" : "auto", in ioc_qos_prfill()
3172 ioc->params.qos[QOS_RPPM] / 10000, in ioc_qos_prfill()
3173 ioc->params.qos[QOS_RPPM] % 10000 / 100, in ioc_qos_prfill()
3174 ioc->params.qos[QOS_RLAT], in ioc_qos_prfill()
3175 ioc->params.qos[QOS_WPPM] / 10000, in ioc_qos_prfill()
3176 ioc->params.qos[QOS_WPPM] % 10000 / 100, in ioc_qos_prfill()
3177 ioc->params.qos[QOS_WLAT], in ioc_qos_prfill()
3178 ioc->params.qos[QOS_MIN] / 10000, in ioc_qos_prfill()
3179 ioc->params.qos[QOS_MIN] % 10000 / 100, in ioc_qos_prfill()
3180 ioc->params.qos[QOS_MAX] / 10000, in ioc_qos_prfill()
3181 ioc->params.qos[QOS_MAX] % 10000 / 100); in ioc_qos_prfill()
3182 spin_unlock(&ioc->lock); in ioc_qos_prfill()
3191 &blkcg_policy_iocost, seq_cft(sf)->private, false); in ioc_qos_show()
3229 disk = ctx.bdev->bd_disk; in ioc_qos_write()
3230 if (!queue_is_mq(disk->queue)) { in ioc_qos_write()
3231 ret = -EOPNOTSUPP; in ioc_qos_write()
3235 ioc = q_to_ioc(disk->queue); in ioc_qos_write()
3240 ioc = q_to_ioc(disk->queue); in ioc_qos_write()
3243 blk_mq_freeze_queue(disk->queue); in ioc_qos_write()
3244 blk_mq_quiesce_queue(disk->queue); in ioc_qos_write()
3246 spin_lock_irq(&ioc->lock); in ioc_qos_write()
3247 memcpy(qos, ioc->params.qos, sizeof(qos)); in ioc_qos_write()
3248 enable = ioc->enabled; in ioc_qos_write()
3249 user = ioc->user_qos_params; in ioc_qos_write()
3317 if (enable && !ioc->enabled) { in ioc_qos_write()
3318 blk_stat_enable_accounting(disk->queue); in ioc_qos_write()
3319 blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue); in ioc_qos_write()
3320 ioc->enabled = true; in ioc_qos_write()
3321 } else if (!enable && ioc->enabled) { in ioc_qos_write()
3322 blk_stat_disable_accounting(disk->queue); in ioc_qos_write()
3323 blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue); in ioc_qos_write()
3324 ioc->enabled = false; in ioc_qos_write()
3328 memcpy(ioc->params.qos, qos, sizeof(qos)); in ioc_qos_write()
3329 ioc->user_qos_params = true; in ioc_qos_write()
3331 ioc->user_qos_params = false; in ioc_qos_write()
3335 spin_unlock_irq(&ioc->lock); in ioc_qos_write()
3342 blk_mq_unquiesce_queue(disk->queue); in ioc_qos_write()
3343 blk_mq_unfreeze_queue(disk->queue); in ioc_qos_write()
3348 spin_unlock_irq(&ioc->lock); in ioc_qos_write()
3350 blk_mq_unquiesce_queue(disk->queue); in ioc_qos_write()
3351 blk_mq_unfreeze_queue(disk->queue); in ioc_qos_write()
3353 ret = -EINVAL; in ioc_qos_write()
3362 const char *dname = blkg_dev_name(pd->blkg); in ioc_cost_model_prfill()
3363 struct ioc *ioc = pd_to_iocg(pd)->ioc; in ioc_cost_model_prfill()
3364 u64 *u = ioc->params.i_lcoefs; in ioc_cost_model_prfill()
3369 spin_lock(&ioc->lock); in ioc_cost_model_prfill()
3373 dname, ioc->user_cost_model ? "user" : "auto", in ioc_cost_model_prfill()
3376 spin_unlock(&ioc->lock); in ioc_cost_model_prfill()
3385 &blkcg_policy_iocost, seq_cft(sf)->private, false); in ioc_cost_model_show()
3425 ret = -EOPNOTSUPP; in ioc_cost_model_write()
3431 ret = blk_iocost_init(ctx.bdev->bd_disk); in ioc_cost_model_write()
3440 spin_lock_irq(&ioc->lock); in ioc_cost_model_write()
3441 memcpy(u, ioc->params.i_lcoefs, sizeof(u)); in ioc_cost_model_write()
3442 user = ioc->user_cost_model; in ioc_cost_model_write()
3480 memcpy(ioc->params.i_lcoefs, u, sizeof(u)); in ioc_cost_model_write()
3481 ioc->user_cost_model = true; in ioc_cost_model_write()
3483 ioc->user_cost_model = false; in ioc_cost_model_write()
3486 spin_unlock_irq(&ioc->lock); in ioc_cost_model_write()
3495 spin_unlock_irq(&ioc->lock); in ioc_cost_model_write()
3500 ret = -EINVAL; in ioc_cost_model_write()
3514 .name = "cost.qos",
3520 .name = "cost.model",