/*
 * drivers/video/tegra/dc/bandwidth.c
 *
 * Copyright (c) 2010-2013, NVIDIA CORPORATION, All rights reserved.
 *
 * Author: Jon Mayo <jmayo@nvidia.com>
 *
 * This software is licensed under the terms of the GNU General Public
 * License version 2, as published by the Free Software Foundation, and
 * may be copied, distributed, and modified under those terms.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 */

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/clk.h>
#include <linux/clk-provider.h>
#include <trace/events/display.h>
#include <linux/platform_data/tegra_emc.h>

#include "dc_reg.h"
#include "dc_config.h"
#include "dc_priv.h"

/* Need to set higher EMC than normal usage in case of latency */
#define EMC_BW_USAGE_CUTOFF		2041 /* (100 / 49) * 1000 */
#define EMC_FREQ_CUTOFF_USE_130_PERCENT	100000000
#define EMC_FREQ_CUTOFF_USE_140_PERCENT	50000000

static int use_dynamic_emc = 1;

module_param_named(use_dynamic_emc, use_dynamic_emc, int, S_IRUGO | S_IWUSR);

static int tegra_dc_windows_is_overlapped(struct tegra_dc_win *a,
	struct tegra_dc_win *b)
{
	if (a == b)
		return 0;

	if (!WIN_IS_ENABLED(a) || !WIN_IS_ENABLED(b))
		return 0;

	/* because memory access to load the fifo can overlap, only care
	 * if windows overlap vertically */
	return ((a->out_y + a->out_h > b->out_y) && (a->out_y <= b->out_y)) ||
		((b->out_y + b->out_h > a->out_y) && (b->out_y <= a->out_y));
}

/* check overlapping window combinations to find the max bandwidth. */
static unsigned long tegra_dc_find_max_bandwidth(struct tegra_dc_win *wins[],
						 unsigned n)
{
	unsigned i;
	unsigned j;
	unsigned long bw;
	unsigned long max = 0;

	for (i = 0; i < n; i++) {
		bw = wins[i]->new_bandwidth;
		for (j = 0; j < n; j++)
			if (tegra_dc_windows_is_overlapped(wins[i], wins[j]))
				bw += wins[j]->new_bandwidth;
		if (max < bw)
			max = bw;
	}
	return max;
}

/*
 * Calculate peak EMC bandwidth for each enabled window =
 * pixel_clock * win_bpp * (use_v_filter ? 2 : 1)) * H_scale_factor *
 * (windows_tiling ? 2 : 1)
 *
 * note:
 * (*) We use 2 tap V filter on T2x/T3x, so need double BW if use V filter
 * (*) Tiling mode on T30 and DDR3 requires double BW
 *
 * return:
 * bandwidth in kBps
 */
static unsigned long tegra_dc_calc_win_bandwidth(struct tegra_dc *dc,
	struct tegra_dc_win *w)
{
	u64 ret;
	int tiled_windows_bw_multiplier;
	unsigned long bpp;
	unsigned in_w;

	if (!WIN_IS_ENABLED(w))
		return 0;

	if (dfixed_trunc(w->w) == 0 || dfixed_trunc(w->h) == 0 ||
	    w->out_w == 0 || w->out_h == 0)
		return 0;
	if (w->flags & TEGRA_WIN_FLAG_SCAN_COLUMN)
		/* rotated: PRESCALE_SIZE swapped, but WIN_SIZE is unchanged */
		in_w = dfixed_trunc(w->h);
	else
		in_w = dfixed_trunc(w->w); /* normal output, not rotated */

	/* FIXME: Do this when MC driver is ready. */
	tiled_windows_bw_multiplier = 2;

	/* all of tegra's YUV formats(420 and 422) fetch 2 bytes per pixel,
	 * but the size reported by tegra_dc_fmt_bpp for the planar version
	 * is of the luma plane's size only. */
	bpp = tegra_dc_is_yuv_planar(w->fmt) ?
		2 * tegra_dc_fmt_bpp(w->fmt) : tegra_dc_fmt_bpp(w->fmt);

	/* brackets are used to avoid compiler rearranging expression such
	   that the intermediate result may overflow 32bits */
	ret = (dc->mode.pclk / 1000UL) * (bpp / 8) *
		in_w * (WIN_IS_TILED(w) ?
		tiled_windows_bw_multiplier : 1);
	ret = div_u64(ret, w->out_w);

	return ret;
}

static unsigned long tegra_dc_get_bandwidth(
	struct tegra_dc_win *windows[], int n)
{
	int i;

	BUG_ON(n > windows[0]->dc->n_windows);

	/* emc rate and latency allowance both need to know per window
	 * bandwidths */
	for (i = 0; i < n; i++) {
		struct tegra_dc_win *w = windows[i];

		if (w)
			w->new_bandwidth =
				tegra_dc_calc_win_bandwidth(w->dc, w);
	}

	return tegra_dc_find_max_bandwidth(windows, n);
}

/* to save power, call when display memory clients would be idle */
void tegra_dc_clear_bandwidth(struct tegra_dc *dc)
{
	trace_clear_bandwidth(dc);
	if (__clk_get_enable_count(dc->emc_clk))
		clk_disable_unprepare(dc->emc_clk);
	dc->bw_kbps = 0;
}

/* bw in kByte/second. returns Hz for EMC frequency */
static inline unsigned long tegra_dc_kbps_to_emc(struct tegra_dc *dc,
						 unsigned long bw)
{
	struct clk *emc_master;
	unsigned long freq, old_freq;

	emc_master = clk_get_parent(dc->emc_clk);
	if (bw == ULONG_MAX)
		return clk_round_rate(emc_master, ULONG_MAX);

	freq = tegra_emc_bw_to_freq_req(bw);
	/* freq too big - clamp at max */
	if (freq >= (ULONG_MAX / 1000))
		return clk_round_rate(emc_master, ULONG_MAX);

	/* should never occur because of above */
	if (WARN_ONCE((freq * 1000) < freq, "Bandwidth Overflow"))
		return clk_round_rate(emc_master, ULONG_MAX);

	freq *= 1000;
	freq = clk_round_rate(emc_master, freq);
	/*
	 * Ensure that the normal bw used by the display is no more than 49%
	 * of the total bandwidth set since we will need enough in our
	 * display FIFOs to survive any latency (due to DVFS freq change, etc.)
	 * and we need to fill up the FIFOs before the next latency.
	 */
	while (tegra_emc_freq_req_to_bw(freq) < EMC_BW_USAGE_CUTOFF * bw) {
		old_freq = freq;
		freq = clk_round_rate(emc_master, freq + 1);
		if (old_freq == freq)
			return freq;
	}
	/* Depending on frequency value, the amount of bandwidth usage % of
	 * total we should use is different. Thus we should request a multiple of
	 * original bandwidth on this.  Use 1.4 for < 50MHz, 1.3 for < 100MHz,
	 * else 1.1 */
	if (freq < EMC_FREQ_CUTOFF_USE_140_PERCENT)
		bw += 4 * bw / 10; /* 1.4 */
	else if (freq < EMC_FREQ_CUTOFF_USE_130_PERCENT)
		bw += 3 * bw / 10; /* 1.3 */
	else
		bw += bw / 10; /* 1.1 */
	freq = tegra_emc_bw_to_freq_req(bw);
	freq *= 1000;
	freq = clk_round_rate(emc_master, freq);
	/* Again ensure the bw used is no more than the cutoff */
	while (tegra_emc_freq_req_to_bw(freq) < EMC_BW_USAGE_CUTOFF * bw) {
		freq = clk_round_rate(emc_master, freq + 1);
	}
	return freq;
}

/* use the larger of dc->bw_kbps or dc->new_bw_kbps, and copies
 * dc->new_bw_kbps into dc->bw_kbps.
 * calling this function both before and after a flip is sufficient to select
 * the best possible frequency and latency allowance.
 * set use_new to true to force dc->new_bw_kbps programming.
 */
void tegra_dc_program_bandwidth(struct tegra_dc *dc, bool use_new)
{
	unsigned i;

	if (use_new || dc->bw_kbps != dc->new_bw_kbps) {
		unsigned long bw = max(dc->bw_kbps, dc->new_bw_kbps);
		unsigned long emc_freq;

		/* going from 0 to non-zero */
		if (!dc->bw_kbps && dc->new_bw_kbps &&
			!__clk_get_enable_count(dc->emc_clk))
			clk_prepare_enable(dc->emc_clk);

		emc_freq = tegra_dc_kbps_to_emc(dc, bw);
		clk_set_rate(dc->emc_clk, emc_freq);

		/* disabling dc->emc_clk if both of dc->bw_kbps
		 * and dc->new_bw_kpbs are set to 0, and
		 * dc->vblank_ref_count is 0
		 */
		if (!dc->bw_kbps && !dc->new_bw_kbps &&
			!dc->vblank_ref_count &&
			__clk_get_enable_count(dc->emc_clk))
			clk_disable_unprepare(dc->emc_clk);

		dc->bw_kbps = dc->new_bw_kbps;
	}

	for (i = 0; i < dc->n_windows; i++) {
		struct tegra_dc_win *w = &dc->windows[i];

		/* TODO: Notify MC our new latency allowance. */

		trace_program_bandwidth(dc);
		w->bandwidth = w->new_bandwidth;
	}
}

int tegra_dc_set_dynamic_emc(struct tegra_dc_win *windows[], int n)
{
	unsigned long new_rate;
	struct tegra_dc *dc;

	if (!use_dynamic_emc)
		return 0;

	dc = windows[0]->dc;

	if (tegra_dc_has_multiple_dc())
		new_rate = ULONG_MAX;
	else
		new_rate = tegra_dc_get_bandwidth(windows, n);

	dc->new_bw_kbps = new_rate;
	trace_set_dynamic_emc(dc);

	return 0;
}

/* return the minimum bandwidth in kbps for display to function */
long tegra_dc_calc_min_bandwidth(struct tegra_dc *dc)
{
	unsigned pclk = tegra_dc_get_out_max_pixclock(dc);

	if (WARN_ONCE(!dc, "dc is NULL") ||
		WARN_ONCE(!dc->out, "dc->out is NULL!"))
		return 0;

	if (!pclk && dc->out->type == TEGRA_DC_OUT_HDMI) {
		pclk = tegra_dc_get_out_max_pixclock(dc);
		if (!pclk) {
			if (is_tegra114())
				pclk = 300000000; /* 300MHz max */
			else
				pclk = 150000000; /* 150MHz max */
		}
	} else {
		pclk = dc->mode.pclk;
	}
	return pclk / 1000 * 4; /* support a single 32bpp window */
}