/*
 * drivers/video/tegra/host/gk20a/gk20a.c
 *
 * GK20A Graphics
 *
 * Copyright (c) 2011-2013, NVIDIA CORPORATION.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

#include "dev.h"
#include "class_ids.h"
#include "bus_client.h"
#include "nvhost_as.h"

#include "gk20a.h"
#include "ctrl_gk20a.h"
#include "hw_mc_gk20a.h"
#include "hw_timer_gk20a.h"
#include "hw_bus_gk20a.h"
#include "hw_sim_gk20a.h"
#include "gk20a_power.h"
#include "gk20a_scale.h"
#include "gr3d/pod_scaling.h"

#ifdef CONFIG_TEGRA_GK20A_DEBUG_SESSION
#include "dbg_gpu_gk20a.h"
#endif

static inline void set_gk20a(struct platform_device *dev, struct gk20a *gk20a)
{
	nvhost_set_private_data(dev, gk20a);
}

/* TBD: should be able to put in the list below. */
static struct resource gk20a_intr = {
	.start = TEGRA_GK20A_INTR,
	.end   = TEGRA_GK20A_INTR_NONSTALL,
	.flags = IORESOURCE_IRQ,
};

struct resource gk20a_resources_sim[] = {
	{
		.start = TEGRA_GK20A_BAR0_BASE,
		.end   = TEGRA_GK20A_BAR0_BASE + TEGRA_GK20A_BAR0_SIZE - 1,
		.flags = IORESOURCE_MEM,
	},
	{
		.start = TEGRA_GK20A_BAR1_BASE,
		.end   = TEGRA_GK20A_BAR1_BASE + TEGRA_GK20A_BAR1_SIZE - 1,
		.flags = IORESOURCE_MEM,
	},
	{
		.start = TEGRA_GK20A_SIM_BASE,
		.end   = TEGRA_GK20A_SIM_BASE + TEGRA_GK20A_SIM_SIZE - 1,
		.flags = IORESOURCE_MEM,
	},
};

const struct file_operations tegra_gk20a_ctrl_ops = {
	.owner = THIS_MODULE,
	.release = gk20a_ctrl_dev_release,
	.open = gk20a_ctrl_dev_open,
	.unlocked_ioctl = gk20a_ctrl_dev_ioctl,
};

#ifdef CONFIG_TEGRA_GK20A_DEBUG_SESSION
const struct file_operations tegra_gk20a_dbg_gpu_ops = {
	.owner = THIS_MODULE,
	.release = gk20a_dbg_gpu_dev_release,
	.open = gk20a_dbg_gpu_dev_open,
	.unlocked_ioctl = gk20a_dbg_gpu_dev_ioctl,
	.poll = gk20a_dbg_gpu_dev_poll,
#ifdef CONFIG_COMPAT
	.compat_ioctl = gk20a_dbg_gpu_dev_ioctl,
#endif
};

/*
 * Note: we use a different 'open' to trigger handling of the profiler
 * session.  Most of the code is shared between the two paths; if handling
 * both in the same path ever gets too tangled, we can separate them cleanly.
 */
const struct file_operations tegra_gk20a_prof_gpu_ops = {
	.owner = THIS_MODULE,
	.release = gk20a_dbg_gpu_dev_release,
	.open = gk20a_prof_gpu_dev_open,
	.unlocked_ioctl = gk20a_dbg_gpu_dev_ioctl,
	/* .mmap = gk20a_prof_gpu_dev_mmap, */
	/* int (*mmap) (struct file *, struct vm_area_struct *); */
#ifdef CONFIG_COMPAT
	.compat_ioctl = gk20a_dbg_gpu_dev_ioctl,
#endif
};
#endif
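
/*
 * Simulator (FMODEL) RPC support.
 *
 * Requests are built in a page-sized message buffer: a fixed-size header
 * (signature, result, spare, function and length words, written by
 * sim_write_hdr()) followed by function-specific parameters addressed via
 * sim_msg_param().  The physical address of that buffer is published to the
 * host through a send descriptor ring, and replies are picked up from a
 * matching receive ring (see rpc_send_message()/rpc_recv_poll() below).
 */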

static inline void sim_writel(struct gk20a *g, u32 r, u32 v)
{
	writel(v, g->sim.regs + r);
}

static inline u32 sim_readl(struct gk20a *g, u32 r)
{
	return readl(g->sim.regs + r);
}

static inline u32 sim_msg_header_size(void)
{
	return 24; /*TBD: fix the header to get this from NV_VGPU_MSG_HEADER*/
}

static inline u32 *sim_msg_bfr(struct gk20a *g, u32 byte_offset)
{
	return (u32 *)(g->sim.msg_bfr.kvaddr + byte_offset);
}

static inline u32 *sim_msg_hdr(struct gk20a *g, u32 byte_offset)
{
	return sim_msg_bfr(g, byte_offset); /*starts at 0*/
}

static inline u32 *sim_msg_param(struct gk20a *g, u32 byte_offset)
{
	/*starts after msg header/cmn*/
	return sim_msg_bfr(g, byte_offset + sim_msg_header_size());
}

static inline void sim_write_hdr(struct gk20a *g, u32 func, u32 size)
{
	/*memset(g->sim.msg_bfr.kvaddr,0,min(PAGE_SIZE,size));*/
	*sim_msg_hdr(g, sim_msg_signature_r()) = sim_msg_signature_valid_v();
	*sim_msg_hdr(g, sim_msg_result_r())    = sim_msg_result_rpc_pending_v();
	*sim_msg_hdr(g, sim_msg_spare_r())     = sim_msg_spare__init_v();
	*sim_msg_hdr(g, sim_msg_function_r())  = func;
	*sim_msg_hdr(g, sim_msg_length_r())    = size + sim_msg_header_size();
}

static inline u32 sim_escape_read_hdr_size(void)
{
	return 12; /*TBD: fix NV_VGPU_SIM_ESCAPE_READ_HEADER*/
}

static u32 *sim_send_ring_bfr(struct gk20a *g, u32 byte_offset)
{
	return (u32 *)(g->sim.send_bfr.kvaddr + byte_offset);
}

static int rpc_send_message(struct gk20a *g)
{
	/* calculations done in units of u32s */
	u32 send_base = sim_send_put_pointer_v(g->sim.send_ring_put) * 2;
	u32 dma_offset = send_base + sim_dma_r()/sizeof(u32);
	u32 dma_hi_offset = send_base + sim_dma_hi_r()/sizeof(u32);

	*sim_send_ring_bfr(g, dma_offset*sizeof(u32)) =
		sim_dma_target_phys_pci_coherent_f() |
		sim_dma_status_valid_f() |
		sim_dma_size_4kb_f() |
		sim_dma_addr_lo_f(g->sim.msg_bfr.phys >> PAGE_SHIFT);

	*sim_send_ring_bfr(g, dma_hi_offset*sizeof(u32)) = 0; /*TBD >32b phys*/

	*sim_msg_hdr(g, sim_msg_sequence_r()) = g->sim.sequence_base++;

	g->sim.send_ring_put = (g->sim.send_ring_put + 2 * sizeof(u32)) %
		PAGE_SIZE;

	__cpuc_flush_dcache_area(g->sim.msg_bfr.kvaddr, PAGE_SIZE);
	__cpuc_flush_dcache_area(g->sim.send_bfr.kvaddr, PAGE_SIZE);
	__cpuc_flush_dcache_area(g->sim.recv_bfr.kvaddr, PAGE_SIZE);

	/* Update the put pointer. This will trap into the host. */
	sim_writel(g, sim_send_put_r(), g->sim.send_ring_put);

	return 0;
}

static inline u32 *sim_recv_ring_bfr(struct gk20a *g, u32 byte_offset)
{
	return (u32 *)(g->sim.recv_bfr.kvaddr + byte_offset);
}
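
/*
 * Wait for the simulator to answer the request that was just queued.
 * The receive ring's PUT pointer is polled (busy-wait, no interrupt) until
 * it moves past our GET pointer; each reply descriptor is checked against
 * the physical address of the message buffer before the GET pointer is
 * advanced and written back to the simulator.
 */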
static int rpc_recv_poll(struct gk20a *g)
{
	phys_addr_t recv_phys_addr;

	/* XXX This read is not required (?) */
	/*pVGpu->recv_ring_get = VGPU_REG_RD32(pGpu, NV_VGPU_RECV_GET);*/

	/* Poll the recv ring get pointer in an infinite loop */
	do {
		g->sim.recv_ring_put = sim_readl(g, sim_recv_put_r());
	} while (g->sim.recv_ring_put == g->sim.recv_ring_get);

	/* process all replies */
	while (g->sim.recv_ring_put != g->sim.recv_ring_get) {
		/* these are in u32 offsets */
		u32 dma_lo_offset =
			sim_recv_put_pointer_v(g->sim.recv_ring_get)*2 + 0;
		/*u32 dma_hi_offset = dma_lo_offset + 1;*/
		u32 recv_phys_addr_lo = sim_dma_addr_lo_v(
				*sim_recv_ring_bfr(g, dma_lo_offset*4));

		/*u32 recv_phys_addr_hi = sim_dma_hi_addr_v(
		      (phys_addr_t)sim_recv_ring_bfr(g,dma_hi_offset*4));*/

		/*TBD >32b phys addr */
		recv_phys_addr = recv_phys_addr_lo << PAGE_SHIFT;

		if (recv_phys_addr != g->sim.msg_bfr.phys) {
			dev_err(dev_from_gk20a(g), "%s Error in RPC reply\n",
				__func__);
			return -1;
		}

		/* Update GET pointer */
		g->sim.recv_ring_get = (g->sim.recv_ring_get + 2*sizeof(u32)) %
			PAGE_SIZE;

		__cpuc_flush_dcache_area(g->sim.msg_bfr.kvaddr, PAGE_SIZE);
		__cpuc_flush_dcache_area(g->sim.send_bfr.kvaddr, PAGE_SIZE);
		__cpuc_flush_dcache_area(g->sim.recv_bfr.kvaddr, PAGE_SIZE);

		sim_writel(g, sim_recv_get_r(), g->sim.recv_ring_get);

		g->sim.recv_ring_put = sim_readl(g, sim_recv_put_r());
	}

	return 0;
}

static int issue_rpc_and_wait(struct gk20a *g)
{
	int err;

	err = rpc_send_message(g);
	if (err) {
		dev_err(dev_from_gk20a(g), "%s failed rpc_send_message\n",
			__func__);
		return err;
	}

	err = rpc_recv_poll(g);
	if (err) {
		dev_err(dev_from_gk20a(g), "%s failed rpc_recv_poll\n",
			__func__);
		return err;
	}

	/* Now check if RPC really succeeded */
	if (*sim_msg_hdr(g, sim_msg_result_r()) != sim_msg_result_success_v()) {
		dev_err(dev_from_gk20a(g), "%s received failed status!\n",
			__func__);
		return -(*sim_msg_hdr(g, sim_msg_result_r()));
	}
	return 0;
}

int gk20a_sim_esc_read(struct gk20a *g, char *path, u32 index, u32 count,
		       u32 *data)
{
	int err;
	size_t pathlen = strlen(path);
	u32 data_offset;

	sim_write_hdr(g, sim_msg_function_sim_escape_read_v(),
		      sim_escape_read_hdr_size());
	*sim_msg_param(g, 0) = index;
	*sim_msg_param(g, 4) = count;
	data_offset = roundup(0xc + pathlen + 1, sizeof(u32));
	*sim_msg_param(g, 8) = data_offset;
	strcpy((char *)sim_msg_param(g, 0xc), path);

	err = issue_rpc_and_wait(g);

	if (!err)
		memcpy(data, sim_msg_param(g, data_offset), count);
	return err;
}

static irqreturn_t gk20a_intr_isr(int irq, void *dev_id)
{
	struct gk20a *g = dev_id;
	u32 mc_intr_0;

	if (!g->power_on)
		return IRQ_NONE;

	/* not from gpu when sharing irq with others */
	mc_intr_0 = gk20a_readl(g, mc_intr_0_r());
	if (unlikely(!mc_intr_0))
		return IRQ_NONE;

	gk20a_writel(g, mc_intr_en_0_r(),
		mc_intr_en_0_inta_disabled_f());

	/* flush previous write */
	gk20a_readl(g, mc_intr_en_0_r());

	return IRQ_WAKE_THREAD;
}

static void gk20a_pbus_isr(struct gk20a *g)
{
	u32 val;

	val = gk20a_readl(g, bus_intr_0_r());
	if (val & (bus_intr_0_pri_squash_m() |
			bus_intr_0_pri_fecserr_m() |
			bus_intr_0_pri_timeout_m())) {
		nvhost_err(&g->dev->dev,
			"NV_PTIMER_PRI_TIMEOUT_SAVE_0: 0x%x\n",
			gk20a_readl(g, timer_pri_timeout_save_0_r()));
		nvhost_err(&g->dev->dev,
			"NV_PTIMER_PRI_TIMEOUT_SAVE_1: 0x%x\n",
			gk20a_readl(g, timer_pri_timeout_save_1_r()));
		nvhost_err(&g->dev->dev,
			"NV_PTIMER_PRI_TIMEOUT_FECS_ERRCODE: 0x%x\n",
			gk20a_readl(g, timer_pri_timeout_fecs_errcode_r()));
	}
	if (val)
		nvhost_err(&g->dev->dev,
			"Unhandled pending pbus interrupt\n");

	gk20a_writel(g, bus_intr_0_r(), val);
}
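
/*
 * Threaded half of the stalling interrupt handler.  gk20a_intr_isr() above
 * masks the interrupt (mc_intr_en_0 <- disabled) and returns
 * IRQ_WAKE_THREAD; this thread then dispatches to the per-unit ISRs
 * (graphics, fifo, pmu, priv ring, ltc, pbus) and re-enables the hardware
 * interrupt line when done.
 */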
static irqreturn_t gk20a_intr_thread(int irq, void *dev_id)
{
	struct gk20a *g = dev_id;
	u32 mc_intr_0;

	nvhost_dbg(dbg_intr, "interrupt thread launched");

	mc_intr_0 = gk20a_readl(g, mc_intr_0_r());

	if (mc_intr_0 & mc_intr_0_pgraph_pending_f())
		gr_gk20a_elpg_protected_call(g, gk20a_gr_isr(g));
	if (mc_intr_0 & mc_intr_0_pfifo_pending_f())
		gk20a_fifo_isr(g);
	if (mc_intr_0 & mc_intr_0_pmu_pending_f())
		gk20a_pmu_isr(g);
	if (mc_intr_0 & mc_intr_0_priv_ring_pending_f())
		gk20a_priv_ring_isr(g);
	if (mc_intr_0 & mc_intr_0_ltc_pending_f())
		gk20a_mm_ltc_isr(g);
	if (mc_intr_0 & mc_intr_0_pbus_pending_f())
		gk20a_pbus_isr(g);
	if (mc_intr_0)
		nvhost_dbg_info("leaving isr with interrupt pending 0x%08x",
				mc_intr_0);

	gk20a_writel(g, mc_intr_en_0_r(),
		mc_intr_en_0_inta_hardware_f());

	/* flush previous write */
	gk20a_readl(g, mc_intr_en_0_r());

	return IRQ_HANDLED;
}

static void gk20a_remove_support(struct platform_device *dev)
{
	struct gk20a *g = get_gk20a(dev);

	if (g->pmu.remove_support)
		g->pmu.remove_support(&g->pmu);

	if (g->gk20a_cdev.gk20a_cooling_dev)
		thermal_cooling_device_unregister(
			g->gk20a_cdev.gk20a_cooling_dev);

	if (g->gr.remove_support)
		g->gr.remove_support(&g->gr);

	if (g->fifo.remove_support)
		g->fifo.remove_support(&g->fifo);

	if (g->mm.remove_support)
		g->mm.remove_support(&g->mm);

	if (g->sim.remove_support)
		g->sim.remove_support(&g->sim);

	release_firmware(g->pmu_fw);

	if (g->irq_requested) {
		free_irq(gk20a_intr.start, g);
		g->irq_requested = false;
	}

	/* free mappings to registers, etc */
	if (g->regs) {
		iounmap(g->regs);
		g->regs = 0;
	}
}

int nvhost_init_gk20a_support(struct platform_device *dev)
{
	int err = 0;
	struct gk20a *g = get_gk20a(dev);
	struct nvhost_device_data *pdata = nvhost_get_devdata(dev);

	g->regs = pdata->aperture[GK20A_BAR0_IORESOURCE_MEM];
	if (!g->regs) {
		dev_err(dev_from_gk20a(g), "failed to remap gk20a registers\n");
		err = -ENXIO;
		goto fail;
	}

	g->bar1 = pdata->aperture[GK20A_BAR1_IORESOURCE_MEM];
	if (!g->bar1) {
		dev_err(dev_from_gk20a(g), "failed to remap gk20a bar1\n");
		err = -ENXIO;
		goto fail;
	}

	mutex_init(&g->dbg_sessions_lock);

	err = gk20a_init_clk_gpcpll(g);
	if (err)
		goto fail;

	/* other inits are deferred until gpu is powered up. */

	g->remove_support = gk20a_remove_support;
	return 0;

 fail:
	gk20a_remove_support(dev);
	return err;
}

int nvhost_gk20a_init(struct platform_device *dev)
{
	struct gk20a *g = get_gk20a(dev);
	int err;

	nvhost_dbg_fn("");

#ifndef CONFIG_PM_RUNTIME
	nvhost_gk20a_finalize_poweron(dev);
#endif

	/*
	 * nvhost_as alloc_share can be called before gk20a is powered on.
	 * It requires mm sw states configured so init mm sw early here.
	 */
	err = gk20a_init_mm_setup_sw(g);
	if (err)
		return err;

	if (IS_ENABLED(CONFIG_TEGRA_GK20A_DEVFREQ))
		nvhost_gk20a_scale_hw_init(dev);

	return 0;
}

void nvhost_gk20a_deinit(struct platform_device *dev)
{
	nvhost_dbg_fn("");

#ifndef CONFIG_PM_RUNTIME
	nvhost_gk20a_prepare_poweroff(dev);
#endif
}

static void gk20a_free_hwctx(struct kref *ref)
{
	struct nvhost_hwctx *ctx = container_of(ref, struct nvhost_hwctx, ref);

	nvhost_dbg_fn("");

	gk20a_busy(ctx->channel->dev);

	if (ctx->priv)
		gk20a_free_channel(ctx, true);

	gk20a_idle(ctx->channel->dev);

	kfree(ctx);
}
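
/*
 * nvhost hwctx glue: a gk20a channel is created and destroyed together with
 * the nvhost_hwctx that wraps it.  The context is reference counted; the
 * last gk20a_put_hwctx() ends up in gk20a_free_hwctx() above, which frees
 * the channel while holding a busy reference on the device.
 */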
static struct nvhost_hwctx *gk20a_alloc_hwctx(struct nvhost_hwctx_handler *h,
					      struct nvhost_channel *ch)
{
	struct nvhost_hwctx *ctx;

	nvhost_dbg_fn("");

	/* it seems odd to be allocating a channel here but the
	 * t20/t30 notion of a channel is mapped on top of gk20a's
	 * channel.  this works because there is only one module
	 * under gk20a's host (gr).
	 */
	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
	if (!ctx)
		return NULL;

	kref_init(&ctx->ref);
	ctx->h = h;
	ctx->channel = ch;

	mutex_init(&ctx->error_notifier_mutex);

	return gk20a_open_channel(ch, ctx);
}

static void gk20a_get_hwctx(struct nvhost_hwctx *hwctx)
{
	nvhost_dbg_fn("");
	kref_get(&hwctx->ref);
}

static void gk20a_put_hwctx(struct nvhost_hwctx *hwctx)
{
	nvhost_dbg_fn("");
	kref_put(&hwctx->ref, gk20a_free_hwctx);
}

static void gk20a_save_push_hwctx(struct nvhost_hwctx *ctx,
				  struct nvhost_cdma *cdma)
{
	nvhost_dbg_fn("");
}

struct nvhost_hwctx_handler *
nvhost_gk20a_alloc_hwctx_handler(u32 syncpt, u32 waitbase,
				 struct nvhost_channel *ch)
{
	struct nvhost_hwctx_handler *h;

	nvhost_dbg_fn("");

	h = kmalloc(sizeof(*h), GFP_KERNEL);
	if (!h)
		return NULL;

	h->alloc = gk20a_alloc_hwctx;
	h->get = gk20a_get_hwctx;
	h->put = gk20a_put_hwctx;
	h->save_push = gk20a_save_push_hwctx;

	return h;
}

int nvhost_gk20a_prepare_poweroff(struct platform_device *dev)
{
	struct gk20a *g = get_gk20a(dev);
	int ret = 0;

	nvhost_dbg_fn("");

	if (!g->power_on)
		return 0;

	ret |= gk20a_channel_suspend(g);

	/*
	 * After this point, gk20a interrupts should not get
	 * serviced.
	 */
	if (g->irq_requested) {
		free_irq(gk20a_intr.start, g);
		g->irq_requested = false;
	}

	/* disable elpg before gr or fifo suspend */
	ret |= gk20a_pmu_destroy(g);
	ret |= gk20a_gr_suspend(g);
	ret |= gk20a_mm_suspend(g);
	ret |= gk20a_fifo_suspend(g);

	/* Disable GPCPLL */
	ret |= gk20a_suspend_clk_support(g);
	g->power_on = false;

	return ret;
}

#ifdef CONFIG_PM_GENERIC_DOMAINS
int nvhost_gk20a_domain_power_on(struct generic_pm_domain *domain)
{
	struct nvhost_device_data *pdata;

	pdata = container_of(domain, struct nvhost_device_data, pd);

	if (pdata->can_powergate) {
		mutex_lock(&pdata->lock);
		gk20a_power_on(pdata->pdev, get_gk20a(pdata->pdev));
		mutex_unlock(&pdata->lock);
	}

	return 0;
}

int nvhost_gk20a_domain_power_off(struct generic_pm_domain *domain)
{
	struct nvhost_device_data *pdata;

	pdata = container_of(domain, struct nvhost_device_data, pd);

	if (pdata->can_powergate) {
		mutex_lock(&pdata->lock);
		gk20a_power_off(pdata->pdev, get_gk20a(pdata->pdev));
		mutex_unlock(&pdata->lock);
	}

	return 0;
}
#endif
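
/*
 * Bring the GPU up after unpowergating: request the (threaded) stall
 * interrupt, unmask the interrupt sources we handle, reset the priv ring,
 * then initialize clk, fifo, mm, pmu, gr and therm in that order.  The
 * caller's nice value is temporarily raised to speed up this sequence, and
 * channels are resumed once the PMU reports PMU_STATE_STARTED.
 */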
int nvhost_gk20a_finalize_poweron(struct platform_device *dev)
{
	struct gk20a *g = get_gk20a(dev);
	int err, nice_value;

	nvhost_dbg_fn("");

	if (g->power_on)
		return 0;

	nice_value = task_nice(current);
	set_user_nice(current, -20);

	if (!g->irq_requested) {
		err = request_threaded_irq(gk20a_intr.start,
				gk20a_intr_isr, gk20a_intr_thread,
				0, "gk20a", g);
		if (err) {
			dev_err(dev_from_gk20a(g),
				"failed to request stall intr irq @ %lld\n",
				(u64)gk20a_intr.start);
			goto done;
		}
		g->irq_requested = true;
	}

	g->power_on = true;

	gk20a_writel(g, mc_intr_en_1_r(),
		mc_intr_en_1_inta_disabled_f());

	gk20a_writel(g, mc_intr_mask_0_r(),
		mc_intr_0_pgraph_pending_f()
		| mc_intr_0_pfifo_pending_f()
		| mc_intr_0_priv_ring_pending_f()
		| mc_intr_0_ltc_pending_f()
		| mc_intr_0_pbus_pending_f());

	gk20a_writel(g, mc_intr_en_0_r(),
		mc_intr_en_0_inta_hardware_f());

	gk20a_writel(g, bus_intr_en_0_r(),
			bus_intr_en_0_pri_squash_m() |
			bus_intr_en_0_pri_fecserr_m() |
			bus_intr_en_0_pri_timeout_m());
	gk20a_reset_priv_ring(g);

	/* TBD: move this after graphics init in which blcg/slcg is enabled.
	   This function removes SlowdownOnBoot which applies 32x divider
	   on gpcpll bypass path. The purpose of slowdown is to save power
	   during boot but it also significantly slows down gk20a init on
	   simulation and emulation. We should remove SOB after graphics
	   power saving features (blcg/slcg) are enabled. For now, do it
	   here. */
	err = gk20a_init_clk_support(g);
	if (err) {
		nvhost_err(&dev->dev, "failed to init gk20a clk");
		goto done;
	}

	err = gk20a_init_fifo_reset_enable_hw(g);
	if (err) {
		nvhost_err(&dev->dev, "failed to reset gk20a fifo");
		goto done;
	}

	err = gk20a_init_mm_support(g);
	if (err) {
		nvhost_err(&dev->dev, "failed to init gk20a mm");
		goto done;
	}

	err = gk20a_init_pmu_support(g);
	if (err) {
		nvhost_err(&dev->dev, "failed to init gk20a pmu");
		goto done;
	}

	err = gk20a_init_fifo_support(g);
	if (err) {
		nvhost_err(&dev->dev, "failed to init gk20a fifo");
		goto done;
	}

	err = gk20a_init_gr_support(g);
	if (err) {
		nvhost_err(&dev->dev, "failed to init gk20a gr");
		goto done;
	}

	err = gk20a_init_therm_support(g);
	if (err) {
		nvhost_err(&dev->dev, "failed to init gk20a therm");
		goto done;
	}

	wait_event(g->pmu.boot_wq, g->pmu.pmu_state == PMU_STATE_STARTED);

	gk20a_channel_resume(g);
	set_user_nice(current, nice_value);

done:
	return err;
}

static struct of_device_id tegra_gk20a_of_match[] = {
	{ .compatible = "nvidia,tegra124-gk20a",
		.data = (struct nvhost_device_data *)&tegra_gk20a_info },
	{ },
};

int tegra_gpu_get_max_state(struct thermal_cooling_device *cdev,
		unsigned long *max_state)
{
	struct cooling_device_gk20a *gk20a_gpufreq_device = cdev->devdata;

	*max_state = gk20a_gpufreq_device->gk20a_freq_table_size - 1;
	return 0;
}

int tegra_gpu_get_cur_state(struct thermal_cooling_device *cdev,
		unsigned long *cur_state)
{
	struct cooling_device_gk20a *gk20a_gpufreq_device = cdev->devdata;

	*cur_state = gk20a_gpufreq_device->gk20a_freq_state;
	return 0;
}

int tegra_gpu_set_cur_state(struct thermal_cooling_device *c_dev,
		unsigned long cur_state)
{
	u32 target_freq;
	struct gk20a *g;
	struct gpufreq_table_data *gpu_cooling_table;
	struct cooling_device_gk20a *gk20a_gpufreq_device = c_dev->devdata;

	BUG_ON(cur_state >= gk20a_gpufreq_device->gk20a_freq_table_size);

	g = container_of(gk20a_gpufreq_device, struct gk20a, gk20a_cdev);

	gpu_cooling_table = tegra_gpufreq_table_get();
	target_freq = gpu_cooling_table[cur_state].frequency;

	/* ensure a query for state will get the proper value */
	gk20a_gpufreq_device->gk20a_freq_state = cur_state;

	gk20a_clk_set_rate(g, target_freq);

	return 0;
}

static struct thermal_cooling_device_ops tegra_gpu_cooling_ops = {
	.get_max_state = tegra_gpu_get_max_state,
	.get_cur_state = tegra_gpu_get_cur_state,
	.set_cur_state = tegra_gpu_set_cur_state,
};
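
/*
 * Probe: look up platform data (from DT or board files), pick up the BAR
 * apertures through the nvhost client helpers, register the thermal
 * cooling device and debugfs knobs, and set the default timeout and
 * clock-gating state.  Powering the GPU up is left to
 * nvhost_gk20a_finalize_poweron().
 */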
static int gk20a_probe(struct platform_device *dev)
{
	struct gk20a *gk20a;
	int err;
	struct nvhost_device_data *pdata = NULL;
	struct cooling_device_gk20a *gpu_cdev = NULL;

	if (dev->dev.of_node) {
		const struct of_device_id *match;

		match = of_match_device(tegra_gk20a_of_match, &dev->dev);
		if (match)
			pdata = (struct nvhost_device_data *)match->data;
	} else
		pdata = (struct nvhost_device_data *)dev->dev.platform_data;

	if (!pdata) {
		dev_err(&dev->dev, "no platform data\n");
		return -ENODATA;
	}

	nvhost_dbg_fn("");

	pdata->pdev = dev;
	mutex_init(&pdata->lock);
	platform_set_drvdata(dev, pdata);

	err = nvhost_client_device_get_resources(dev);
	if (err)
		return err;

	nvhost_module_init(dev);

	gk20a = kzalloc(sizeof(struct gk20a), GFP_KERNEL);
	if (!gk20a) {
		dev_err(&dev->dev, "couldn't allocate gk20a support");
		return -ENOMEM;
	}

	set_gk20a(dev, gk20a);
	gk20a->dev = dev;
	gk20a->host = nvhost_get_host(dev);

	nvhost_init_gk20a_support(dev);

#ifdef CONFIG_PM_GENERIC_DOMAINS
	pdata->pd.name = "gk20a";
	pdata->pd.power_on = nvhost_gk20a_domain_power_on;
	pdata->pd.power_off = nvhost_gk20a_domain_power_off;

	err = nvhost_module_add_domain(&pdata->pd, dev);
#endif

	err = nvhost_client_device_init(dev);
	if (err) {
		nvhost_dbg_fn("failed to init client device for %s",
			      dev->name);
		pm_runtime_put(&dev->dev);
		return err;
	}

	err = nvhost_as_init_device(dev);
	if (err) {
		nvhost_dbg_fn("failed to init client address space device for %s",
			      dev->name);
		return err;
	}

	gpu_cdev = &gk20a->gk20a_cdev;
	gpu_cdev->gk20a_freq_table_size = tegra_gpufreq_table_size_get();
	gpu_cdev->gk20a_freq_state = 0;
	gpu_cdev->g = gk20a;
	gpu_cdev->gk20a_cooling_dev =
		thermal_cooling_device_register("gk20a_cdev", gpu_cdev,
						&tegra_gpu_cooling_ops);

	gk20a->gr_idle_timeout_default = CONFIG_TEGRA_GRHOST_DEFAULT_TIMEOUT;
	gk20a->timeouts_enabled = true;

	/* Set up initial clock gating settings */
	gk20a->slcg_enabled = true;
	gk20a->blcg_enabled = true;
	gk20a->elcg_enabled = true;
	gk20a->elpg_enabled = true;

	gk20a_create_sysfs(dev);

#ifdef CONFIG_DEBUG_FS
	clk_gk20a_debugfs_init(dev);

	spin_lock_init(&gk20a->debugfs_lock);
	gk20a->mm.ltc_enabled = true;
	gk20a->mm.ltc_enabled_debug = true;
	gk20a->debugfs_ltc_enabled =
			debugfs_create_bool("ltc_enabled", S_IRUGO|S_IWUSR,
					pdata->debugfs,
					&gk20a->mm.ltc_enabled_debug);
	gk20a->debugfs_gr_idle_timeout_default =
			debugfs_create_u32("gr_idle_timeout_default_us",
					S_IRUGO|S_IWUSR, pdata->debugfs,
					&gk20a->gr_idle_timeout_default);
	gk20a->debugfs_timeouts_enabled =
			debugfs_create_bool("timeouts_enabled",
					S_IRUGO|S_IWUSR, pdata->debugfs,
					&gk20a->timeouts_enabled);
	gk20a_pmu_debugfs_init(dev);
#endif

	spin_lock_init(&gk20a->mc_enable_lock);

	init_waitqueue_head(&gk20a->gr.init_wq);
	init_waitqueue_head(&gk20a->pmu.boot_wq);

	return 0;
}

static int __exit gk20a_remove(struct platform_device *dev)
{
	struct gk20a *g = get_gk20a(dev);
	nvhost_dbg_fn("");

	if (g->remove_support)
		g->remove_support(dev);

	set_gk20a(dev, 0);
#ifdef CONFIG_DEBUG_FS
	debugfs_remove(g->debugfs_ltc_enabled);
	debugfs_remove(g->debugfs_gr_idle_timeout_default);
	debugfs_remove(g->debugfs_timeouts_enabled);
#endif

	kfree(g);

#ifdef CONFIG_PM_RUNTIME
	pm_runtime_put(&dev->dev);
	pm_runtime_disable(&dev->dev);
#else
	nvhost_module_disable_clk(&dev->dev);
#endif

	return 0;
}

static struct platform_driver gk20a_driver = {
	.probe = gk20a_probe,
	.remove = __exit_p(gk20a_remove),
	.driver = {
		.owner = THIS_MODULE,
		.name = "gk20a",
#ifdef CONFIG_OF
		.of_match_table = tegra_gk20a_of_match,
#endif
#ifdef CONFIG_PM
		.pm = &nvhost_module_pm_ops,
#endif
	}
};

static int __init gk20a_init(void)
{
	return platform_driver_register(&gk20a_driver);
}

static void __exit gk20a_exit(void)
{
	platform_driver_unregister(&gk20a_driver);
}

void gk20a_busy(struct platform_device *pdev)
{
	struct nvhost_device_data *pdata = platform_get_drvdata(pdev);

	pm_runtime_get_sync(&pdev->dev);
	if (pdata->busy)
		pdata->busy(pdev);
}

void gk20a_idle(struct platform_device *pdev)
{
	struct nvhost_device_data *pdata = platform_get_drvdata(pdev);

#ifdef CONFIG_PM_RUNTIME
	if (pdata->idle && atomic_read(&pdev->dev.power.usage_count) == 1)
		pdata->idle(pdev);
	pm_runtime_mark_last_busy(&pdev->dev);
	pm_runtime_put_sync_autosuspend(&pdev->dev);
#else
	if (pdata->idle)
		pdata->idle(pdev);
#endif
}

void gk20a_disable(struct gk20a *g, u32 units)
{
	u32 pmc;

	nvhost_dbg(dbg_info, "pmc disable: %08x\n", units);

	spin_lock(&g->mc_enable_lock);
	pmc = gk20a_readl(g, mc_enable_r());
	pmc &= ~units;
	gk20a_writel(g, mc_enable_r(), pmc);
	spin_unlock(&g->mc_enable_lock);
}
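
/*
 * Engine enable/reset via NV_PMC_ENABLE.  gk20a_reset() pulses a unit's
 * enable bit: clear it, wait ~20us, then set it again.  gk20a_enable()
 * reads the register back to flush the write before its own 20us settle
 * delay, and mc_enable_lock serializes the read-modify-write.
 */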
void gk20a_enable(struct gk20a *g, u32 units)
{
	u32 pmc;

	nvhost_dbg(dbg_info, "pmc enable: %08x\n", units);

	spin_lock(&g->mc_enable_lock);
	pmc = gk20a_readl(g, mc_enable_r());
	pmc |= units;
	gk20a_writel(g, mc_enable_r(), pmc);
	spin_unlock(&g->mc_enable_lock);
	gk20a_readl(g, mc_enable_r());

	udelay(20);
}

void gk20a_reset(struct gk20a *g, u32 units)
{
	gk20a_disable(g, units);
	udelay(20);
	gk20a_enable(g, units);
}

module_init(gk20a_init);
module_exit(gk20a_exit);