| /* |
| * tiler-nv12.c |
| * |
| * TILER driver NV12 area reservation functions for TI TILER hardware block. |
| * |
| * Author: Lajos Molnar <molnar@ti.com> |
| * |
| * Copyright (C) 2009-2010 Texas Instruments, Inc. |
| * |
| * This package is free software; you can redistribute it and/or modify |
| * it under the terms of the GNU General Public License version 2 as |
| * published by the Free Software Foundation. |
| * |
| * THIS PACKAGE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR |
| * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED |
| * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. |
| */ |
| |
| #include "_tiler.h" |
| |
| static struct tiler_ops *ops; /* shared methods and variables */ |
| static int band_8; |
| static int band_16; |
| |
| /* |
| * NV12 Reservation Functions |
| * |
| * TILER is designed so that a (w * h) * 8bit area is twice as wide as a |
| * (w/2 * h/2) * 16bit area. Since having pairs of such 8-bit and 16-bit |
| * blocks is a common use case for TILER, we optimize packing these into a |
| * TILER area. |
| * |
| * During reservation we want to find the most effective packing (most used |
| * area in the smallest overall area). |
| * |
| * We have two algorithms for packing nv12 blocks: either pack 8- and 16-bit |
| * blocks into separate container areas, or pack them together into the |
| * same area. |
| */ |
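| |
| /* |
| * For example, if an NV12 pair's 8-bit (Y) block is w = 16 slots wide, its |
| * 16-bit (UV) block is (w + 1) / 2 = 8 slots wide, so the pair uses 24 of |
| * the 64 slots in a band. The packing methods below exploit this 2:1 width |
| * relationship. |
| */ |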
| |
| /** |
| * Calculate effectiveness of packing. We weight total area much higher than |
| * packing efficiency to get the smallest overall container use. |
| * |
| * @param w width of one (8-bit) block |
| * @param n buffers in a packing |
| * @param area width of packing area |
| * @param n_total total number of buffers to be packed |
| * @return effectiveness, the higher the better |
| */ |
| static inline u32 nv12_eff(u16 w, u16 n, u16 area, u16 n_total) |
| { |
| return 0x10000000 - |
| /* weigh against total area needed (for all buffers) */ |
| /* 64-slots = -2048 */ |
| DIV_ROUND_UP(n_total, n) * area * 32 + |
| /* packing efficiency (0 - 1024) */ |
| 1024 * n * ((w * 3 + 1) >> 1) / area; |
| } |
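| |
| /* |
| * Illustrative numbers: for n_total = 9 pairs of w = 4 blocks, the 9-in-64 |
| * hardcoded packing below scores |
| * 0x10000000 - 1 * 64 * 32 + 1024 * 9 * 6 / 64 = 0x10000000 - 1184, |
| * while a hypothetical packing fitting only 5 pairs per 64-slot area needs |
| * two areas for the 9 buffers and scores |
| * 0x10000000 - 2 * 64 * 32 + 1024 * 5 * 6 / 64 = 0x10000000 - 3616, |
| * so the packing with the smaller total area wins by a wide margin. |
| */ |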
| |
| /** |
| * Fallback nv12 packing algorithm: pack the 8- and 16-bit blocks into |
| * separate areas. |
| * |
| * @author a0194118 (7/16/2010) |
| * |
| * @param o desired offset (<a) |
| * @param a desired alignment (>=2) |
| * @param w block width (>0) |
| * @param n number of blocks desired |
| * @param area pointer to store total area needed |
| * |
| * @return number of blocks that can be allocated |
| */ |
| static u16 nv12_separate(u16 o, u16 a, u16 w, u16 n, u16 *area) |
| { |
| tiler_best2pack(o, a, band_8, w, &n, area); |
| tiler_best2pack(o >> 1, a >> 1, band_16, (w + 1) >> 1, &n, area); |
| *area *= 3; |
| return n; |
| } |
| |
| /* |
| * Specialized NV12 Reservation Algorithms |
| * |
| * We use 4 packing methods that pack nv12 blocks into the same area. Together |
| * these 4 methods give the optimal result for most possible input parameters. |
| * |
| * For now we pack into a 64-slot area, so that we don't have to worry about |
| * stride issues (all blocks get a 4K stride). For some of the algorithms |
| * this would still hold even if the area was 128 slots. |
| */ |
| |
| /** |
| * Packing types are marked using a letter sequence, capital letters denoting |
| * 8-bit blocks, lower case letters denoting corresponding 16-bit blocks. |
| * |
| * All methods take the following parameters. Each method also defines the |
| * maximum number of block pairs it could potentially pack (the MAX_* limits). |
| * |
| * @param o, a, w, n offset, alignment, width, # of blocks as usual |
| * @param area pointer to store area needed for packing |
| * @param p pointer to store packing coordinates |
| * @return number of blocks that can be packed |
| */ |
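| |
| /* |
| * Each entry pair in p[] is the 8-bit block's slot position followed by its |
| * 16-bit block's slot position within the shared area, i.e. |
| * p[] = { x8_0, x16_0, x8_1, x16_1, ... }. |
| */ |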
| |
| /* Method A: progressive packing: AAAAaaaaBBbbCc into 64-slot area */ |
| #define MAX_A 21 |
| static int nv12_A(u16 o, u16 a, u16 w, u16 n, u16 *area, u8 *p) |
| { |
| u16 x = o, u, l, m = 0; |
| *area = band_8; |
| |
| while (x + w < *area && m < n) { |
| /* current 8bit upper bound (a) is next 8bit lower bound (B) */ |
| l = u = (*area + x) >> 1; |
| |
| /* pack until upper bound */ |
| while (x + w <= u && m < n) { |
| /* save packing */ |
| BUG_ON(m + 1 >= MAX_A); |
| *p++ = x; |
| *p++ = l; |
| l = (*area + x + w + 1) >> 1; |
| x = ALIGN(x + w - o, a) + o; |
| m++; |
| } |
| x = ALIGN(l - o, a) + o; /* set new lower bound */ |
| } |
| return m; |
| } |
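| |
| /* |
| * Illustrative trace of method A: with o = 0, a = 4, w = 16 and a 64-slot |
| * band, two pairs fit below the midpoint, giving 8-bit blocks at slots 0 and |
| * 16 and their 8-slot 16-bit blocks at 32 and 40, i.e. AAAABBBBaabb---- in |
| * 4-slot units. |
| */ |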
| |
| /* Method -A: regressive packing: cCbbBBaaaaAAAA into 64-slot area */ |
| static int nv12_revA(u16 o, u16 a, u16 w, u16 n, u16 *area, u8 *p) |
| { |
| u16 m; |
| |
| /* this is a mirrored packing of method A */ |
| n = nv12_A((a - (o + w) % a) % a, a, w, n, area, p); |
| |
| /* reverse packing */ |
| for (m = 0; m < n; m++) { |
| *p = *area - *p - w; |
| p++; |
| *p = *area - *p - ((w + 1) >> 1); |
| p++; |
| } |
| return n; |
| } |
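| |
| /* |
| * Mirroring the method A trace above: the same o = 0, a = 4, w = 16 input |
| * packs as ----bbaaBBBBAAAA (in 4-slot units), with the 8-bit blocks at |
| * slots 32 and 48 and the 16-bit blocks at 16 and 24. |
| */ |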
| |
| /* Method B: simple layout: aAbcBdeCfgDhEFGH */ |
| #define MAX_B 8 |
| static int nv12_B(u16 o, u16 a, u16 w, u16 n, u16 *area, u8 *p) |
| { |
| u16 e = (o + w) % a; /* end offset */ |
| u16 o1 = (o >> 1) % a; /* half offset */ |
| u16 e1 = ((o + w + 1) >> 1) % a; /* half end offset */ |
| u16 o2 = o1 + (a >> 1); /* 2nd half offset */ |
| u16 e2 = e1 + (a >> 1); /* 2nd half end offset */ |
| u16 m = 0; |
| *area = band_8; |
| |
| /* ensure 16-bit blocks don't overlap 8-bit blocks */ |
| |
| /* width cannot wrap around alignment, half block must be before block, |
| 2nd half can be before or after */ |
| if (w < a && o < e && e1 <= o && (e2 <= o || o2 >= e)) |
| while (o + w <= *area && m < n) { |
| BUG_ON(m + 1 >= MAX_B); |
| *p++ = o; |
| *p++ = o >> 1; |
| m++; |
| o += a; |
| } |
| return m; |
| } |
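| |
| /* |
| * Illustrative example of method B: with o = 12, a = 16, w = 2 and a 64-slot |
| * band, four pairs fit: 8-bit blocks at slots 12, 28, 44 and 60, each with |
| * its 1-slot 16-bit block at half its offset (6, 14, 22 and 30), and none of |
| * the 16-bit blocks overlap an 8-bit block. |
| */ |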
| |
| /* Method C: butterfly layout: AAbbaaBB */ |
| #define MAX_C 20 |
| static int nv12_C(u16 o, u16 a, u16 w, u16 n, u16 *area, u8 *p) |
| { |
| int m = 0; |
| u16 o2, e = ALIGN(w, a), i = 0, j = 0; |
| *area = band_8; |
| o2 = *area - (a - (o + w) % a) % a; /* end of last possible block */ |
| |
| m = (min(o2 - 2 * o, 2 * o2 - o - *area) / 3 - w) / e + 1; |
| for (i = j = 0; i < m && j < n; i++, j++) { |
| BUG_ON(j + 1 >= MAX_C); |
| *p++ = o + i * e; |
| *p++ = (o + i * e + *area) >> 1; |
| if (++j < n) { |
| *p++ = o2 - i * e - w; |
| *p++ = (o2 - i * e - w) >> 1; |
| } |
| } |
| return min(j, n); /* j may overshoot the placed count by one */ |
| } |
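| |
| /* |
| * Illustrative example of method C: with o = 0, a = 4, w = 16 and a 64-slot |
| * band, two pairs are placed from opposite ends: 8-bit blocks at slots 0 and |
| * 48 with their 16-bit blocks at 32 and 24, i.e. AAAA--bbaa--BBBB in 4-slot |
| * units. |
| */ |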
| |
| /* Method D: for large allocation: aA or Aa */ |
| #define MAX_D 1 |
| static int nv12_D(u16 o, u16 a, u16 w, u16 n, u16 *area, u8 *p) |
| { |
| u16 o1, w1 = (w + 1) >> 1, d; |
| *area = ALIGN(o + w, band_8); |
| |
| for (d = 0; n > 0 && d + o + w <= *area; d += a) { |
| /* try to fit 16-bit before 8-bit */ |
| o1 = ((o + d) % band_8) >> 1; |
| if (o1 + w1 <= o + d) { |
| *p++ = o + d; |
| *p++ = o1; |
| return 1; |
| } |
| |
| /* try to fit 16-bit after 8-bit */ |
| o1 += ALIGN(d + o + w - o1, band_16); |
| if (o1 + w1 <= *area) { |
| *p++ = o; |
| *p++ = o1; |
| return 1; |
| } |
| } |
| return 0; |
| } |
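| |
| /* |
| * Illustrative example of method D: with o = 0, a = 4, w = 20, the area is |
| * ALIGN(20, 64) = 64 slots and the first fit is found at d = 20: the 10-slot |
| * 16-bit block lands at slot 10 and the 8-bit block at slot 20 (an "aA" |
| * layout). |
| */ |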
| |
| /** |
| * Umbrella nv12 packing method. This selects the best packing from the above |
| * methods. It also contains hardcoded packings for parameter combinations |
| * that have more efficient packings. This method is guaranteed to provide |
| * the optimal packing if 2 <= a <= 64, w <= 64 and n is large. |
| */ |
| #define MAX_ANY 21 /* must cover the largest method MAX_* and hardcoded n */ |
| static u16 nv12_together(u16 o, u16 a, u16 w, u16 n, u16 *area, u8 *packing) |
| { |
| u16 n_best, a_best, n2, a_, o_, w_; |
| |
| /* algo results (packings) */ |
| u8 pack_A[MAX_A * 2], pack_rA[MAX_A * 2]; |
| u8 pack_B[MAX_B * 2], pack_C[MAX_C * 2]; |
| u8 pack_D[MAX_D * 2]; |
| |
| /* |
| * Hardcoded packings. They are sorted by increasing area, and then by |
| * decreasing n. We may not get the best efficiency if fewer than n |
| * blocks are needed, as the packings are not necessarily sorted in |
| * order of efficiency. However, for those n-s one of the other 4 methods |
| * may return the optimal packing. |
| */ |
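| |
| /* |
| * Each record is laid out as { n, o, w, a, area, <n coordinate pairs> }; |
| * the list is terminated by a single 0. |
| */ |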
| u8 packings[] = { |
| /* n=9, o=2, w=4, a=4, area=64 */ |
| 9, 2, 4, 4, 64, |
| /* 8-bit, 16-bit block coordinate pairs */ |
| 2, 33, 6, 35, 10, 37, 14, 39, 18, 41, |
| 46, 23, 50, 25, 54, 27, 58, 29, |
| /* o=0, w=12, a=4, n=3 */ |
| 3, 0, 12, 4, 64, |
| 0, 32, 12, 38, 48, 24, |
| /* end */ |
| 0 |
| }, *p = packings, *p_best = NULL, *p_end; |
| p_end = packings + sizeof(packings) - 1; |
| |
| /* see which method gives the best packing */ |
| |
| /* start with smallest area algorithms A, B & C, stop if we can |
| pack all buffers */ |
| n_best = nv12_A(o, a, w, n, area, pack_A); |
| p_best = pack_A; |
| if (n_best < n) { |
| n2 = nv12_revA(o, a, w, n, &a_best, pack_rA); |
| if (n2 > n_best) { |
| n_best = n2; |
| p_best = pack_rA; |
| *area = a_best; |
| } |
| } |
| if (n_best < n) { |
| n2 = nv12_B(o, a, w, n, &a_best, pack_B); |
| if (n2 > n_best) { |
| n_best = n2; |
| p_best = pack_B; |
| *area = a_best; |
| } |
| } |
| if (n_best < n) { |
| n2 = nv12_C(o, a, w, n, &a_best, pack_C); |
| if (n2 > n_best) { |
| n_best = n2; |
| p_best = pack_C; |
| *area = a_best; |
| } |
| } |
| |
| /* traverse any special packings */ |
| while (*p) { |
| n2 = *p++; |
| o_ = *p++; |
| w_ = *p++; |
| a_ = *p++; |
| /* stop if we already have a better packing */ |
| if (n2 < n_best) |
| break; |
| |
| /* check if this packing is satisfactory */ |
| if (a_ >= a && o + w + ALIGN(o_ - o, a) <= o_ + w_) { |
| *area = *p++; |
| n_best = min(n2, n); |
| p_best = p; |
| break; |
| } |
| |
| /* skip to next packing */ |
| p += 1 + n2 * 2; |
| } |
| |
| /* |
| * If so far unsuccessful, check whether 8- and 16-bit blocks can be |
| * co-packed. This will actually be done in the end by the normal |
| * allocation, but we need to reserve a big-enough area. |
| */ |
| if (!n_best) { |
| n_best = nv12_D(o, a, w, n, area, pack_D); |
| p_best = NULL; |
| } |
| |
| /* store best packing */ |
| if (p_best && n_best) { |
| BUG_ON(n_best > MAX_ANY); |
| memcpy(packing, p_best, n_best * 2 * sizeof(*pack_A)); |
| } |
| |
| return n_best; |
| } |
| |
| /* reserve nv12 blocks */ |
| static void reserve_nv12(u32 n, u32 width, u32 height, |
| u32 gid, struct process_info *pi) |
| { |
| u16 w, h, band, a, o = 0; |
| struct gid_info *gi; |
| int res = 0, res2, i; |
| u16 n_t, n_s, area_t, area_s; |
| u8 packing[2 * MAX_ANY]; |
| struct list_head reserved = LIST_HEAD_INIT(reserved); |
| |
| /* check input parameters for correctness and support */ |
| if (!width || !height || !n || |
| n > ops->width * ops->height / 2) |
| return; |
| |
| /* calculate dimensions, band, and alignment in slots */ |
| if (ops->analize(TILFMT_8BIT, width, height, &w, &h, &band, &a)) |
| return; |
| |
| /* get group context */ |
| gi = ops->get_gi(pi, gid); |
| if (!gi) |
| return; |
| |
| /* reserve in groups until failure or until everything is reserved */ |
| for (i = 0; i < n && res >= 0; i += res) { |
| /* check packing separately vs together */ |
| n_s = nv12_separate(o, a, w, n - i, &area_s); |
| if (ops->nv12_packed) |
| n_t = nv12_together(o, a, w, n - i, &area_t, packing); |
| else |
| n_t = 0; |
| |
| /* pack based on better efficiency */ |
| res = -1; |
| if (!ops->nv12_packed || |
| nv12_eff(w, n_s, area_s, n - i) > |
| nv12_eff(w, n_t, area_t, n - i)) { |
| |
| /* |
| * Reserve blocks separately into a temporary list, so |
| * that we can free them if unsuccessful. We need to be |
| * able to reserve both 8- and 16-bit blocks as the |
| * offsets of them must match. |
| */ |
| res = ops->lay_2d(TILFMT_8BIT, n_s, w, h, band_8, a, |
| gi, &reserved); |
| res2 = ops->lay_2d(TILFMT_16BIT, n_s, (w + 1) >> 1, h, |
| band_16, a >> 1, gi, &reserved); |
| |
| if (res2 < 0 || res < 0 || res != res2) { |
| /* clean up */ |
| ops->release(&reserved); |
| res = -1; |
| } else { |
| /* add list to reserved */ |
| ops->add_reserved(&reserved, gi); |
| } |
| } |
| |
| /* if separate packing failed, still try to pack together */ |
| if (res < 0 && ops->nv12_packed && n_t) { |
| /* pack together */ |
| res = ops->lay_nv12(n_t, area_t, w, h, gi, packing); |
| } |
| } |
| |
| ops->release_gi(gi); |
| } |
| |
| /* initialize shared method pointers and global static variables */ |
| void tiler_nv12_init(struct tiler_ops *tiler) |
| { |
| ops = tiler; |
| |
| ops->reserve_nv12 = reserve_nv12; |
| |
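| /* |
| * Number of slots that span one 4 KiB stride in each container; with the |
| * usual TILER slot geometry both bands work out to 64 slots, the 64-slot |
| * band assumed by the packing methods above. |
| */ |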
| band_8 = PAGE_SIZE / ops->geom(TILFMT_8BIT)->slot_w |
| / ops->geom(TILFMT_8BIT)->bpp; |
| band_16 = PAGE_SIZE / ops->geom(TILFMT_16BIT)->slot_w |
| / ops->geom(TILFMT_16BIT)->bpp; |
| } |