Files
mckernel/kernel/sdma.c
2018-06-13 00:31:40 +09:00

665 lines
18 KiB
C

/*
* Copyright(c) 2015, 2016 Intel Corporation.
*
* This file is provided under a dual BSD/GPLv2 license. When using or
* redistributing this file, you may do so under either license.
*
* GPL LICENSE SUMMARY
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of version 2 of the GNU General Public License as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* BSD LICENSE
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* - Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
*/
#include <hfi1/ihk_hfi1_common.h>
#include <hfi1/user_sdma.h>
#include <hfi1/sdma.h>
#include <hfi1/common.h>
/*
 * Driver capability mask, initialised to HFI1_CAP_MASK_DEFAULT.
 * NOTE(review): not referenced in the code visible here; presumably read
 * through capability macros in the hfi1 headers - confirm.
 */
unsigned long hfi1_cap_mask = HFI1_CAP_MASK_DEFAULT;
/*
 * Descriptor queue size: must be a power of 2 >= 64 <= 32768.
 * NOTE(review): not referenced in the code visible here.
 */
#define SDMA_DESCQ_CNT 2048
/* NOTE(review): presumably the descriptor interval between interrupts - confirm */
#define SDMA_DESC_INTR 64
/* Sentinel used by sdma_send_txlist() meaning "no tail update is pending" */
#define INVALID_TAIL 0xffff
/*
 * Mask applied to the running submit count in sdma_send_txlist(); the
 * tail register is written whenever (count & mask) == 0, i.e. once every
 * 32 submitted requests.
 */
#define SDMA_TAIL_UPDATE_THRESH 0x1F
/**
 * sdma_select_engine_vl() - pick an sdma engine for a <selector, vl> pair
 * @dd: devdata
 * @selector: a spreading factor, masked into the per-VL engine table
 * @vl: virtual lane, masked into the VL map
 *
 * The VL map pointer is sampled once with ACCESS_ONCE().  Engine 0
 * (&dd->per_sdma[0]) is returned whenever no better answer exists:
 * @vl is out of range, no map is installed, or the selected map slot
 * is NULL.
 *
 * Return: the chosen engine, never NULL.
 */
struct sdma_engine *sdma_select_engine_vl(
	struct hfi1_devdata *dd,
	u32 selector,
	u8 vl)
{
	struct sdma_engine *engine = NULL;

	/*
	 * An out-of-range vl should only happen if SC->VL changed after
	 * the initial checks on the QP/AH; it falls through to engine 0.
	 */
	if (vl < HFI1_MAX_VLS_SUPPORTED) {
		struct sdma_vl_map *vl_map = ACCESS_ONCE(dd->sdma_map);
		struct sdma_map_elem *elem;

		/* No map installed: answer immediately with engine 0 */
		if (unlikely(!vl_map))
			return &dd->per_sdma[0];

		elem = vl_map->map[vl & vl_map->mask];
		engine = elem->sde[selector & elem->mask];
	}

	if (!engine)
		engine = &dd->per_sdma[0];

	/* trace_hfi1_sdma_engine_select() is not wired up in this port */
	hfi1_cdbg(AIOWRITE, "-");
	return engine;
}
/*
 * sdma_select_user_engine() - select sdma engine for a user sdma request
 * @dd: devdata
 * @selector: unused by this implementation
 * @vl: unused by this implementation
 *
 * When the calling process belongs to a job with more than one process
 * (e.g. MPI), the engine index is the process rank modulo 16; otherwise
 * it is the id of the current processor modulo 16.
 *
 * NOTE(review): the modulus 16 is hard-coded; presumably it matches the
 * number of entries in dd->per_sdma - confirm.
 */
struct sdma_engine *sdma_select_user_engine(struct hfi1_devdata *dd,
					    u32 selector, u8 vl)
{
	int engine_idx;

	if (cpu_local_var(current)->proc->nr_processes > 1)
		/* Hash on rank for MPI jobs */
		engine_idx = cpu_local_var(current)->proc->process_rank % 16;
	else
		engine_idx = ihk_mc_get_processor_id() % 16;

	return &dd->per_sdma[engine_idx];
}
/*
* return the mode as indicated by the first
* descriptor in the tx.
*/
static inline u8 ahg_mode(struct sdma_txreq *tx)
{
return (tx->descp[0].qw[1] & SDMA_DESC1_HEADER_MODE_SMASK)
>> SDMA_DESC1_HEADER_MODE_SHIFT;
}
/**
 * __sdma_txclean() - release resources attached to a tx request
 * @dd: hfi1_devdata (only needed by the currently disabled unmap code)
 * @tx: tx request to clean
 *
 * Resets the descriptor count, frees the coalesce buffer and, if the
 * descriptor array had been extended beyond the inline tx->descs array,
 * frees the extended array and restores the inline limit.
 *
 * Safe to call repeatedly on the same request: the coalesce pointer is
 * cleared and desc_limit is reset, so a second call frees nothing.
 */
void __sdma_txclean(
	struct hfi1_devdata *dd,
	struct sdma_txreq *tx)
{
	if (tx->num_desc) {
		/* TODO: enable sdma_unmap_desc */
#if 0
u16 i;
u8 skip = 0, mode = ahg_mode(tx);
/* unmap first */
//sdma_unmap_desc(dd, &tx->descp[0]);
/* determine number of AHG descriptors to skip */
if (mode > SDMA_AHG_APPLY_UPDATE1)
skip = mode >> 1;
// for (i = 1 + skip; i < tx->num_desc; i++)
// sdma_unmap_desc(dd, &tx->descp[i]);
#endif
		tx->num_desc = 0;
	}

	/* Drop the coalesce buffer, if any */
	kfree(tx->coalesce_buf);
	tx->coalesce_buf = NULL;

	/*
	 * A limit above the inline array size means descp was kmalloc'ed;
	 * free it and fall back to the inline limit.
	 */
	if (unlikely(tx->desc_limit > ARRAY_SIZE(tx->descs))) {
		kfree(tx->descp);
		tx->desc_limit = ARRAY_SIZE(tx->descs);
	}
}
/*
 * sdma_update_tail() - publish a new ring tail to the engine
 * @sde: engine whose tail register (sde->tail_csr) is written
 * @tail: new tail index
 *
 * A write barrier is issued first so that descriptor stores are ordered
 * before the register write that exposes them.
 */
static inline void sdma_update_tail(struct sdma_engine *sde, u16 tail)
{
hfi1_cdbg(AIOWRITE, ".");
/* Commit writes to memory and advance the tail on the chip */
smp_wmb(); /* see get_txhead() */
writeq(tail, sde->tail_csr);
}
/*
 * add_gen() - stamp the current generation into a descriptor quadword
 * @sde: engine; the generation is the two bits of descq_tail that sit
 *       just above the ring index (descq_tail >> sdma_shift)
 * @qw1: descriptor quadword 1
 *
 * Return: @qw1 with its generation field replaced.
 */
static inline u64 add_gen(struct sdma_engine *sde, u64 qw1)
{
	const u64 gen = (sde->descq_tail >> sde->sdma_shift) & 3;
	const u64 gen_field = (gen & SDMA_DESC1_GENERATION_MASK)
			      << SDMA_DESC1_GENERATION_SHIFT;

	return (qw1 & ~SDMA_DESC1_GENERATION_SMASK) | gen_field;
}
/*
 * submit_tx() - copy one tx request's descriptors into the engine ring
 * @sde: engine whose descriptor ring (sde->descq) receives the descriptors
 * @tx: tx request to submit
 *
 * Space has already been guaranteed and
 * tail side of ring is locked.
 *
 * The hardware tail update is done
 * in the caller and that is facilitated
 * by returning the new tail.
 *
 * There is special case logic for ahg
 * to not add the generation number for
 * up to 2 descriptors that follow the
 * first descriptor.
 *
 * Side effects: advances sde->descq_tail by tx->num_desc, records the
 * request in sde->tx_ring, advances sde->tx_tail, and subtracts
 * tx->num_desc from sde->desc_avail.
 *
 * Return: the new ring tail index (descq_tail masked with sdma_mask).
 */
static inline u16 submit_tx(struct sdma_engine *sde, struct sdma_txreq *tx)
{
int i;
u16 tail;
struct sdma_desc *descp = tx->descp;
u8 skip = 0, mode = ahg_mode(tx);
TP("+");
hfi1_cdbg(AIOWRITE, "+");
/* The first descriptor always carries the generation number */
tail = sde->descq_tail & sde->sdma_mask;
sde->descq[tail].qw[0] = cpu_to_le64(descp->qw[0]);
sde->descq[tail].qw[1] = cpu_to_le64(add_gen(sde, descp->qw[1]));
// trace_hfi1_sdma_descriptor(sde, descp->qw[0], descp->qw[1],
// tail, &sde->descq[tail]);
tail = ++sde->descq_tail & sde->sdma_mask;
descp++;
/* Modes above UPDATE1 are followed by (mode >> 1) AHG edit descriptors */
if (mode > SDMA_AHG_APPLY_UPDATE1)
skip = mode >> 1;
for (i = 1; i < tx->num_desc; i++, descp++) {
u64 qw1;
TP("submitting descs qw[0] = %lu, qw[1] = %lu \n", descp->qw[0], descp->qw[1]);
sde->descq[tail].qw[0] = cpu_to_le64(descp->qw[0]);
if (skip) {
/* edits don't have generation */
qw1 = descp->qw[1];
skip--;
} else {
/* replace generation with real one for non-edits */
qw1 = add_gen(sde, descp->qw[1]);
}
sde->descq[tail].qw[1] = cpu_to_le64(qw1);
// trace_hfi1_sdma_descriptor(sde, descp->qw[0], qw1,
// tail, &sde->descq[tail]);
tail = ++sde->descq_tail & sde->sdma_mask;
}
/* Remember where this request's descriptors end in the ring */
tx->next_descq_idx = tail;
#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
tx->sn = sde->tail_sn++;
// trace_hfi1_sdma_in_sn(sde, tx->sn);
WARN_ON_ONCE(sde->tx_ring[sde->tx_tail & sde->sdma_mask]);
#endif
/* Track the in-flight request and account for the consumed descriptors */
sde->tx_ring[sde->tx_tail++ & sde->sdma_mask] = tx;
sde->desc_avail -= tx->num_desc;
TP("-");
hfi1_cdbg(AIOWRITE, "-");
return tail;
}
/*
 * sdma_check_progress() - re-evaluate ring space for a tx that did not fit
 * @sde: engine to check
 * @wait: wait structure, may be NULL
 * @tx: the request that needs tx->num_desc descriptors
 * @pkts_sent: forwarded unchanged to the sleep callback
 *
 * Refreshes sde->desc_avail from sdma_descq_freecnt().
 *
 * Return:
 * -EAGAIN - @tx now fits (returned before the sleep callback is consulted),
 * -EBUSY  - @tx does not fit and there is no wait structure or no sleep
 *           callback,
 * otherwise whatever the sleep callback returns; if that is -EAGAIN,
 * desc_avail is refreshed once more.
 */
static int sdma_check_progress(
	struct sdma_engine *sde,
	struct iowait_work *wait,
	struct sdma_txreq *tx,
	bool pkts_sent)
{
	int rc = -EBUSY;

	hfi1_cdbg(AIOWRITE, "+");

	sde->desc_avail = sdma_descq_freecnt(sde);
	if (tx->num_desc <= sde->desc_avail)
		return -EAGAIN;

	/* pulse the head_lock */
	if (wait && iowait_ioww_to_iow(wait)->sleep) {
		unsigned seq = raw_seqcount_begin(
			(const seqcount_t *)&sde->head_lock.seqcount);

		rc = wait->iow->sleep(sde, wait, tx, seq, pkts_sent);
		if (rc == -EAGAIN)
			sde->desc_avail = sdma_descq_freecnt(sde);
	}

	hfi1_cdbg(AIOWRITE, "-");
	return rc;
}
/**
 * sdma_send_txlist() - submit a list of tx req to ring
 * @sde: sdma engine to use
 * @wait: SE wait structure to use when full (may be NULL)
 * @tx_list: list of sdma_txreqs to submit
 * @count_out: pointer to a u32 which, after return, will contain the total
 * number of sdma_txreqs removed from the tx_list. This will include
 * sdma_txreqs whose SDMA descriptors are submitted to the ring and the
 * sdma_txreqs which are added to SDMA engine flush list if the SDMA
 * engine state is not running.
 *
 * The call submits the list into the ring while holding sde->tail_lock
 * with interrupts saved/disabled.
 *
 * In all cases, the tx_list will be updated so the head of the tx_list is
 * the list of descriptors that have yet to be transmitted.
 *
 * The intent of this call is to provide a more efficient
 * way of submitting multiple packets to SDMA while holding the tail
 * side locking.
 *
 * NOTE(review): any queuing of unprocessed requests onto @wait is not done
 * in this function; presumably it is the job of the sleep callback reached
 * through sdma_check_progress() - confirm.
 *
 * Return:
 * 0 - Success,
 * -EINVAL - sdma_txreq incomplete (tx->tlen is non-zero),
 * -ECOMM - engine not running; remaining requests moved to the flush list,
 * otherwise the result of sdma_check_progress(): -EBUSY when there is no
 * space in the ring and no sleep callback, or the sleep callback's own
 * return value.
 */
int sdma_send_txlist(struct sdma_engine *sde, struct iowait_work *wait,
struct list_head *tx_list, u32 *count_out)
{
struct sdma_txreq *tx, *tx_next;
int ret = 0;
unsigned long flags;
u16 tail = INVALID_TAIL;
u32 submit_count = 0, flush_count = 0, total_count;
TP("+");
hfi1_cdbg(AIOWRITE, "+");
spin_lock_irqsave(&sde->tail_lock, flags);
retry:
list_for_each_entry_safe(tx, tx_next, tx_list, list) {
/* NOTE(review): wait may be NULL here; assumes iowait_ioww_to_iow() tolerates that - confirm */
tx->wait = iowait_ioww_to_iow(wait);
if (unlikely(!__sdma_running(sde)))
goto unlock_noconn;
if (unlikely(tx->num_desc > sde->desc_avail))
goto nodesc;
/* A non-zero tlen means the request was not fully built */
if (unlikely(tx->tlen)) {
ret = -EINVAL;
goto update_tail;
}
list_del_init(&tx->list);
tail = submit_tx(sde, tx);
submit_count++;
/* Batch hardware tail writes: flush once every 32 submitted requests */
if (tail != INVALID_TAIL &&
(submit_count & SDMA_TAIL_UPDATE_THRESH) == 0) {
sdma_update_tail(sde, tail);
tail = INVALID_TAIL;
}
}
update_tail:
/* Common exit: account for removed requests and flush any pending tail */
TP("+ update_tail:");
total_count = submit_count + flush_count;
if (wait)
iowait_sdma_add(iowait_ioww_to_iow(wait), total_count);
if (tail != INVALID_TAIL)
sdma_update_tail(sde, tail);
spin_unlock_irqrestore(&sde->tail_lock, flags);
*count_out = total_count;
hfi1_cdbg(AIOWRITE, "-");
TP("-");
return ret;
unlock_noconn:
/* Engine not running: move every remaining request onto the flush list */
TP("+ unlock_noconn:");
spin_lock(&sde->flushlist_lock);
list_for_each_entry_safe(tx, tx_next, tx_list, list) {
tx->wait = iowait_ioww_to_iow(wait);
list_del_init(&tx->list);
tx->next_descq_idx = 0;
#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
tx->sn = sde->tail_sn++;
// trace_hfi1_sdma_in_sn(sde, tx->sn);
#endif
list_add_tail(&tx->list, &sde->flushlist);
flush_count++;
iowait_inc_wait_count(wait, tx->num_desc);
}
spin_unlock(&sde->flushlist_lock);
/* The flush worker is not scheduled in this port, so nothing drains the flush list from here */
// TODO: schedule_work
//schedule_work(&sde->flush_worker);
ret = -ECOMM;
goto update_tail;
nodesc:
/* Not enough ring space for tx: re-check, and retry the walk if space appeared */
TP("+ nodesc:");
ret = sdma_check_progress(sde, wait, tx, submit_count > 0);
if (ret == -EAGAIN) {
ret = 0;
goto retry;
}
sde->descq_full_count++;
goto update_tail;
}
/*
 * _extend_sdma_tx_descs() - helper to extend txreq
 * @dd: hfi1_devdata, passed through to __sdma_txclean() on failure
 * @tx: tx request whose descriptor array is exhausted
 *
 * This is called once the initial nominal allocation
 * of descriptors in the sdma_txreq is exhausted.
 *
 * The code will bump the allocation up to the max
 * of MAX_DESC (64) descriptors. There doesn't seem
 * much point in an interim step. The last descriptor
 * is reserved for coalesce buffer in order to support
 * cases where input packet has >MAX_DESC iovecs.
 *
 * The new array is allocated into a local and only published to
 * tx->descp on success, so an allocation failure never leaves the
 * request with a NULL descriptor pointer.
 *
 * Return: 0 on success, -ENOMEM if the request is already at MAX_DESC
 * or an allocation fails (the request is cleaned in that case).
 */
static int _extend_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx)
{
	int i;
	struct sdma_desc *descp;

	/* Handle last descriptor */
	if (unlikely((tx->num_desc == (MAX_DESC - 1)))) {
		/* if tlen is 0, it is for padding, release last descriptor */
		if (!tx->tlen) {
			tx->desc_limit = MAX_DESC;
		} else if (!tx->coalesce_buf) {
			/* allocate coalesce buffer with space for padding */
			tx->coalesce_buf = kmalloc(tx->tlen + sizeof(u32),
						   GFP_ATOMIC);
			if (!tx->coalesce_buf)
				goto enomem;
			tx->coalesce_idx = 0;
		}
		return 0;
	}

	if (unlikely(tx->num_desc == MAX_DESC))
		goto enomem;

	descp = kmalloc_array(
			MAX_DESC,
			sizeof(struct sdma_desc),
			GFP_ATOMIC);
	if (!descp)
		goto enomem;
	tx->descp = descp;

	/* reserve last descriptor for coalescing */
	tx->desc_limit = MAX_DESC - 1;
	/* copy ones already built */
	for (i = 0; i < tx->num_desc; i++)
		tx->descp[i] = tx->descs[i];
	return 0;

enomem:
	__sdma_txclean(dd, tx);
	return -ENOMEM;
}
/*
 * ext_coal_sdma_tx_descs() - extend or coalesce sdma tx descriptors
 * @dd: hfi1_devdata
 * @tx: tx request being built
 * @type: SDMA_MAP_* type of the incoming data
 * @kvaddr: kernel virtual address of the data (unused for SDMA_MAP_PAGE)
 * @page: page holding the data when @type is SDMA_MAP_PAGE
 * @offset: offset of the data within @page
 * @len: number of bytes to add
 *
 * This is called once the initial nominal allocation of descriptors
 * in the sdma_txreq is exhausted.
 *
 * IMPORTANT: the whole implementation is compiled only when
 * __HFI1_ORIG__ is defined. Without it this function ignores all of its
 * arguments and returns 1.
 *
 * With __HFI1_ORIG__: this function calls _extend_sdma_tx_descs to extend
 * or allocate coalesce buffer. If there is an allocated coalesce buffer,
 * it will copy the input packet data into the coalesce buffer. It also adds
 * the coalesce buffer descriptor once the whole packet is received.
 *
 * Return:
 * <0 - error
 * 0 - coalescing, don't populate descriptor
 * 1 - continue with populating descriptor
 */
int ext_coal_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx,
int type, void *kvaddr, struct page *page,
unsigned long offset, u16 len)
{
//TODO: ext_coal_sdma_tx_descs
#ifdef __HFI1_ORIG__
int pad_len, rval;
dma_addr_t addr;
rval = _extend_sdma_tx_descs(dd, tx);
if (rval) {
__sdma_txclean(dd, tx);
return rval;
}
/* If coalesce buffer is allocated, copy data into it */
if (tx->coalesce_buf) {
/* Data without a mapping type cannot be copied */
if (type == SDMA_MAP_NONE) {
__sdma_txclean(dd, tx);
return -EINVAL;
}
if (type == SDMA_MAP_PAGE) {
kvaddr = kmap(page);
kvaddr += offset;
} else if (WARN_ON(!kvaddr)) {
__sdma_txclean(dd, tx);
return -EINVAL;
}
memcpy(tx->coalesce_buf + tx->coalesce_idx, kvaddr, len);
tx->coalesce_idx += len;
if (type == SDMA_MAP_PAGE)
kunmap(page);
/* If there is more data, return */
if (tx->tlen - tx->coalesce_idx)
return 0;
/* Whole packet is received; add any padding */
pad_len = tx->packet_len & (sizeof(u32) - 1);
if (pad_len) {
pad_len = sizeof(u32) - pad_len;
memset(tx->coalesce_buf + tx->coalesce_idx, 0, pad_len);
/* padding is taken care of for coalescing case */
tx->packet_len += pad_len;
tx->tlen += pad_len;
}
/* dma map the coalesce buffer */
addr = dma_map_single(&dd->pcidev->dev,
tx->coalesce_buf,
tx->tlen,
DMA_TO_DEVICE);
if (unlikely(dma_mapping_error(&dd->pcidev->dev, addr))) {
__sdma_txclean(dd, tx);
return -ENOSPC;
}
/* Add descriptor for coalesce buffer */
tx->desc_limit = MAX_DESC;
return _sdma_txadd_daddr(dd, SDMA_MAP_SINGLE, tx,
addr, tx->tlen);
}
#endif /* __HFI1_ORIG__ */
/* Not coalescing (or implementation compiled out): caller populates the descriptor */
return 1;
}
/*
 * _pad_sdma_tx_descs() - append a padding descriptor to a tx request
 * @dd: hfi1_devdata; dd->sdma_pad_phys supplies the pad source address
 * @tx: tx request whose packet length is not dword sized
 *
 * Claims one more descriptor (extending the descriptor array if that
 * reaches the current limit), fills it with the bytes needed to round
 * tx->packet_len up to a multiple of sizeof(u32), and closes the request.
 *
 * Return: 0 on success, or the error from _extend_sdma_tx_descs(), in
 * which case the request has been cleaned.
 */
int _pad_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx)
{
	++tx->num_desc;

	if (unlikely(tx->num_desc == tx->desc_limit)) {
		int err = _extend_sdma_tx_descs(dd, tx);

		if (err) {
			__sdma_txclean(dd, tx);
			return err;
		}
	}

	/* finish the one just added */
	make_tx_sdma_desc(
		tx,
		SDMA_MAP_NONE,
		dd->sdma_pad_phys,
		sizeof(u32) - (tx->packet_len & (sizeof(u32) - 1)));
	_sdma_close_tx(dd, tx);

	return 0;
}
/*
 * _sdma_txreq_ahgadd() - add ahg to the sdma_txreq
 * @tx: tx request; its inline descriptors tx->descs[0..2] are written
 * @num_ahg: number of entries in @ahg (a warning fires if > 9)
 * @ahg_entry: value stored in the HEADER_INDEX field of descs[0]
 * @ahg: array of 32-bit AHG update words
 * @ahg_hlen: header length; a warning fires if it is not a multiple of 4
 *            or equals 4. It is divided by 4 before being stored.
 *
 * The logic will consume up to 3
 * descriptors at the beginning of
 * sdma_txreq.
 *
 * Layout: ahg[0] goes into descs[0].qw[1]; ahg[1..4] fill descs[1] and
 * ahg[5..8] fill descs[2], two 32-bit words per quadword.
 *
 * NOTE(review): if num_ahg were 0, (num_ahg - 1) is -1 and the comparison
 * against the u32 loop index would turn it into a huge bound. Callers are
 * presumably required to pass num_ahg >= 1 - confirm.
 */
void _sdma_txreq_ahgadd(
struct sdma_txreq *tx,
u8 num_ahg,
u8 ahg_entry,
u32 *ahg,
u8 ahg_hlen)
{
u32 i, shift = 0, desc = 0;
u8 mode;
WARN_ON_ONCE(num_ahg > 9 || (ahg_hlen & 3) || ahg_hlen == 4);
/* compute mode: 1 word -> UPDATE1, up to 5 -> UPDATE2, more -> UPDATE3 */
if (num_ahg == 1)
mode = SDMA_AHG_APPLY_UPDATE1;
else if (num_ahg <= 5)
mode = SDMA_AHG_APPLY_UPDATE2;
else
mode = SDMA_AHG_APPLY_UPDATE3;
/* the first descriptor is always consumed */
tx->num_desc++;
/* initialize the additional consumed descriptors to zero */
switch (mode) {
case SDMA_AHG_APPLY_UPDATE3:
tx->num_desc++;
tx->descs[2].qw[0] = 0;
tx->descs[2].qw[1] = 0;
/* FALLTHROUGH */
case SDMA_AHG_APPLY_UPDATE2:
tx->num_desc++;
tx->descs[1].qw[0] = 0;
tx->descs[1].qw[1] = 0;
break;
}
/* stored in units of 4 (the DWS field) */
ahg_hlen >>= 2;
tx->descs[0].qw[1] |=
(((u64)ahg_entry & SDMA_DESC1_HEADER_INDEX_MASK)
<< SDMA_DESC1_HEADER_INDEX_SHIFT) |
(((u64)ahg_hlen & SDMA_DESC1_HEADER_DWS_MASK)
<< SDMA_DESC1_HEADER_DWS_SHIFT) |
(((u64)mode & SDMA_DESC1_HEADER_MODE_MASK)
<< SDMA_DESC1_HEADER_MODE_SHIFT) |
(((u64)ahg[0] & SDMA_DESC1_HEADER_UPDATE1_MASK)
<< SDMA_DESC1_HEADER_UPDATE1_SHIFT);
/*
 * Pack the remaining words: shift alternates 0/32 within a quadword,
 * bit 1 of i selects qw[0] or qw[1], and desc advances every 4 words.
 */
for (i = 0; i < (num_ahg - 1); i++) {
if (!shift && !(i & 2))
desc++;
tx->descs[desc].qw[!!(i & 2)] |=
(((u64)ahg[i + 1])
<< shift);
shift = (shift + 32) & 63;
}
}
/**
 * sdma_ahg_alloc - allocate an AHG entry
 * @sde: engine to allocate from
 *
 * Claims the lowest clear bit of sde->ahg_bits. The bit is taken with
 * test_and_set_bit(); if it turns out to be already set, the search is
 * repeated after cpu_relax().
 *
 * Return:
 * 0-31 when successful, -EINVAL if @sde is NULL,
 * -ENOSPC if an entry is not available
 */
int sdma_ahg_alloc(struct sdma_engine *sde)
{
	int bit;

	if (!sde) {
		trace_hfi1_ahg_allocate(sde, -EINVAL);
		return -EINVAL;
	}

	for (;;) {
		bit = ffz(ACCESS_ONCE(sde->ahg_bits));
		if (bit > 31) {
			/* only indices 0-31 are valid entries */
			trace_hfi1_ahg_allocate(sde, -ENOSPC);
			return -ENOSPC;
		}
		/* claimed if the bit was previously clear */
		if (!test_and_set_bit(bit, &sde->ahg_bits))
			break;
		cpu_relax();
	}

	trace_hfi1_ahg_allocate(sde, bit);
	return bit;
}
/**
* sdma_ahg_free - free an AHG entry
* @sde: engine to return AHG entry
* @ahg_index: index to free
*
* This routine frees the indicate AHG entry.
*/
void sdma_ahg_free(struct sdma_engine *sde, int ahg_index)
{
if (!sde)
return;
trace_hfi1_ahg_deallocate(sde, ahg_index);
if (ahg_index < 0 || ahg_index > 31)
return;
clear_bit(ahg_index, &sde->ahg_bits);
}