From 00b4d007da4e0b212e8c2bc63d5ba34ebe4da4ff Mon Sep 17 00:00:00 2001 From: Andrew Stone Date: Fri, 29 Jan 2021 21:52:15 -0500 Subject: [PATCH] add assembly language jumpless fillNext implementation that only accesses memory once per 32 bits. Inline the code into the interrupt handler. --- src/platforms/esp/32/clockless_rmt_esp32.cpp | 488 ++++++++++++++++--- src/platforms/esp/32/clockless_rmt_esp32.h | 4 +- 2 files changed, 415 insertions(+), 77 deletions(-) diff --git a/src/platforms/esp/32/clockless_rmt_esp32.cpp b/src/platforms/esp/32/clockless_rmt_esp32.cpp index 1b3b7d3f19a..9a73ef42c6c 100644 --- a/src/platforms/esp/32/clockless_rmt_esp32.cpp +++ b/src/platforms/esp/32/clockless_rmt_esp32.cpp @@ -36,11 +36,9 @@ static bool gInitialized = false; int ESP32RMTController::gMaxChannel; int ESP32RMTController::gMemBlocks; - ESP32RMTController::ESP32RMTController(int DATA_PIN, int T1, int T2, int T3, int maxChannel, int memBlocks) - : mPixelData(0), - mSize(0), - mCur(0), + : mPixelData(0), + mSize(0), mWhichHalf(0), mBuffer(0), mBufferSize(0), @@ -87,6 +85,7 @@ uint8_t * ESP32RMTController::getPixelBuffer(int size_in_bytes) if (mPixelData == 0) { mSize = size_in_bytes; mPixelData = (uint8_t *) malloc(mSize); + mEndPtr = mPixelData + size_in_bytes; } return mPixelData; } @@ -131,7 +130,7 @@ void ESP32RMTController::init(gpio_num_t pin) gTX_sem = xSemaphoreCreateBinary(); xSemaphoreGive(gTX_sem); } - + if ( ! FASTLED_RMT_BUILTIN_DRIVER) { // -- Allocate the interrupt if we have not done so yet. 
This // interrupt handler must work for all different kinds of @@ -241,11 +240,11 @@ void IRAM_ATTR ESP32RMTController::startOnChannel(int channel) // the pixel data and the RMT buffer mRMT_mem_start = & (RMTMEM.chan[mRMT_channel].data32[0].val); mRMT_mem_ptr = mRMT_mem_start; - mCur = 0; + mCurPtr = mPixelData; mWhichHalf = 0; mLastFill = 0; - // -- Fill both halves of the RMT buffer (a totaly of 64 bits of pixel data) + // -- Fill both halves of the RMT buffer (a total of 64 bits of pixel data) fillNext(false); fillNext(false); @@ -271,7 +270,7 @@ void IRAM_ATTR ESP32RMTController::tx_start() mLastFill = __clock_cycles(); } -// -- A controller is done +// -- A controller is done // This function is called when a controller finishes writing // its data. It is called either by the custom interrupt // handler (below), or as a callback from the built-in @@ -281,10 +280,6 @@ void IRAM_ATTR ESP32RMTController::doneOnChannel(rmt_channel_t channel, void * a { ESP32RMTController * pController = gOnChannel[channel]; - // -- Turn off output on the pin - // SZG: Do I really need to do this? 
- gpio_matrix_out(pController->mPin, 0x100, 0, 0); - // -- Turn off the interrupts // rmt_set_tx_intr_en(channel, false); // Inline the code for rmt_tx_stop, so it can be placed in IRAM @@ -312,7 +307,7 @@ void IRAM_ATTR ESP32RMTController::doneOnChannel(rmt_channel_t channel, void * a } } } - + // -- Custom interrupt handler // This interrupt handler handles two cases: a controller is // done writing its data, or a controller needs to fill the @@ -321,24 +316,230 @@ void IRAM_ATTR ESP32RMTController::interruptHandler(void *arg) { // -- The basic structure of this code is borrowed from the // interrupt handler in esp-idf/components/driver/rmt.c - uint32_t intr_st = RMT.int_st.val; - uint8_t channel; - - bool stuff_to_do = false; - for (channel = 0; channel < gMaxChannel; channel++) { - int tx_done_bit = channel * 3; - int tx_next_bit = channel + 24; + register uint32_t intr_st = RMT.int_st.val; + register int channel; - ESP32RMTController * pController = gOnChannel[channel]; + register uint32_t tx_done_bit = 1; + register uint32_t tx_next_bit = 1<<24; + for (channel = 0; channel < gMaxChannel; channel++, tx_done_bit <<= 3, tx_next_bit<<=1) { + register ESP32RMTController * pController = gOnChannel[channel]; if (pController != NULL) { - if (intr_st & BIT(tx_next_bit)) { - // -- More to send on this channel - pController->fillNext(true); - RMT.int_clr.val |= BIT(tx_next_bit); - } else { + if (intr_st & tx_next_bit) { + // inline fillNext implementation since function calls are expensive + register uint32_t one_val = pController->mOne.val; + register uint32_t zero_val = pController->mZero.val; + + // -- Use locals for speed + register uint32_t * pItem = (uint32_t *) pController->mRMT_mem_ptr; + + register unsigned char* end = pController->mCurPtr + PULSES_PER_FILL/8; + if (end > pController->mEndPtr) end = pController->mEndPtr; + register unsigned char* curPtr = pController->mCurPtr; + + while(curPtr < end) { + // -- Get the next four bytes of pixel data + register 
uint32_t pixeldata4 = *((uint32_t*) curPtr); + curPtr+=4; + // This assembly code writes the RMT pattern for all 32 bits of pixeldata4 into the RMT buffer. + // It achieves a jump-free 4 cycles per bit by operating as follows: + // First it shifts the target bit into the MSB (not necessary for the first bit) of reg %3 + // Then it executes 2 speculative move operations that copy the correct RMT pattern into a + // working register, based on the sign of %3. Since we shifted the target bit into MSB, that + // bit defines the sign. + // Finally we store the working register to memory, indexed by pItem with a specified offset. + // If the ESP32 was big endian, the offset would simply be incrementing, 0,4,8... However, + // the ESP32 is little endian, which means the bytes are backwards, but the bits within the + // bytes are forwards! + asm( + "movgez a11, %0, %3\n" // if the high bit is zero load the zero_val into reg + "movltz a11, %1, %3\n" // if its 1 load one_val instead + "s32i a11, %2, 96\n" // store a11 into *(pItem+offset). 
Offset is weird because little endian + + "slli %3, %3, 1\n" // Shift next bit into bit 31 (sign bit) + "movgez a11, %0, %3\n" + "movltz a11, %1, %3\n" + "s32i a11, %2, 100\n" + + "slli %3, %3, 1\n" + "movgez a11, %0, %3\n" + "movltz a11, %1, %3\n" + "s32i a11, %2, 104\n" + + "slli %3, %3, 1\n" + "movgez a11, %0, %3\n" + "movltz a11, %1, %3\n" + "s32i a11, %2, 108\n" + + "slli %3, %3, 1\n" + "movgez a11, %0, %3\n" + "movltz a11, %1, %3\n" + "s32i a11, %2, 112\n" + + "slli %3, %3, 1\n" + "movgez a11, %0, %3\n" + "movltz a11, %1, %3\n" + "s32i a11, %2, 116\n" + + "slli %3, %3, 1\n" + "movgez a11, %0, %3\n" + "movltz a11, %1, %3\n" + "s32i a11, %2, 120\n" + + "slli %3, %3, 1\n" + "movgez a11, %0, %3\n" + "movltz a11, %1, %3\n" + "s32i a11, %2, 124\n" + + // second byte + "slli %3, %3, 1\n" + "movgez a11, %0, %3\n" + "movltz a11, %1, %3\n" + "s32i a11, %2, 64\n" + + "slli %3, %3, 1\n" + "movgez a11, %0, %3\n" + "movltz a11, %1, %3\n" + "s32i a11, %2, 68\n" + + "slli %3, %3, 1\n" + "movgez a11, %0, %3\n" + "movltz a11, %1, %3\n" + "s32i a11, %2, 72\n" + + "slli %3, %3, 1\n" + "movgez a11, %0, %3\n" + "movltz a11, %1, %3\n" + "s32i a11, %2, 76\n" + + "slli %3, %3, 1\n" + "movgez a11, %0, %3\n" + "movltz a11, %1, %3\n" + "s32i a11, %2, 80\n" + + "slli %3, %3, 1\n" + "movgez a11, %0, %3\n" + "movltz a11, %1, %3\n" + "s32i a11, %2, 84\n" + + "slli %3, %3, 1\n" + "movgez a11, %0, %3\n" + "movltz a11, %1, %3\n" + "s32i a11, %2, 88\n" + + "slli %3, %3, 1\n" + "movgez a11, %0, %3\n" + "movltz a11, %1, %3\n" + "s32i a11, %2, 92\n" + + // third byte + "slli %3, %3, 1\n" + "movgez a11, %0, %3\n" + "movltz a11, %1, %3\n" + "s32i a11, %2, 32\n" + + "slli %3, %3, 1\n" + "movgez a11, %0, %3\n" + "movltz a11, %1, %3\n" + "s32i a11, %2, 36\n" + + "slli %3, %3, 1\n" + "movgez a11, %0, %3\n" + "movltz a11, %1, %3\n" + "s32i a11, %2, 40\n" + + "slli %3, %3, 1\n" + "movgez a11, %0, %3\n" + "movltz a11, %1, %3\n" + "s32i a11, %2, 44\n" + + "slli %3, %3, 1\n" + "movgez a11, %0, %3\n" + 
"movltz a11, %1, %3\n" + "s32i a11, %2, 48\n" + + "slli %3, %3, 1\n" + "movgez a11, %0, %3\n" + "movltz a11, %1, %3\n" + "s32i a11, %2, 52\n" + + "slli %3, %3, 1\n" + "movgez a11, %0, %3\n" + "movltz a11, %1, %3\n" + "s32i a11, %2, 56\n" + + "slli %3, %3, 1\n" + "movgez a11, %0, %3\n" + "movltz a11, %1, %3\n" + "s32i a11, %2, 60\n" + + // last byte + "slli %3, %3, 1\n" + "movgez a11, %0, %3\n" + "movltz a11, %1, %3\n" + "s32i a11, %2, 0\n" + + "slli %3, %3, 1\n" + "movgez a11, %0, %3\n" + "movltz a11, %1, %3\n" + "s32i a11, %2, 4\n" + + "slli %3, %3, 1\n" + "movgez a11, %0, %3\n" + "movltz a11, %1, %3\n" + "s32i a11, %2, 8\n" + + "slli %3, %3, 1\n" + "movgez a11, %0, %3\n" + "movltz a11, %1, %3\n" + "s32i a11, %2, 12\n" + + "slli %3, %3, 1\n" + "movgez a11, %0, %3\n" + "movltz a11, %1, %3\n" + "s32i a11, %2, 16\n" + + "slli %3, %3, 1\n" + "movgez a11, %0, %3\n" + "movltz a11, %1, %3\n" + "s32i a11, %2, 20\n" + + "slli %3, %3, 1\n" + "movgez a11, %0, %3\n" + "movltz a11, %1, %3\n" + "s32i a11, %2, 24\n" + + "slli %3, %3, 1\n" + "movgez a11, %0, %3\n" + "movltz a11, %1, %3\n" + "s32i a11, %2, 28\n" + : + : "r" (zero_val), "r" (one_val), "r" (pItem), "r" (pixeldata4) + : "a11"); + + pItem += 32; + } + if (end == pController->mEndPtr) *pItem++ = 0; // tell RMT we are done + pController->mCurPtr = curPtr; + + // -- Flip to the other half, resetting the pointer if necessary + pController->mWhichHalf++; + if (pController->mWhichHalf == 2) { + pItem = (uint32_t*) pController->mRMT_mem_start; + pController->mWhichHalf = 0; + } + + // -- Store the new pointer back into the object + pController->mRMT_mem_ptr = (volatile uint32_t*) pItem; + RMT.int_clr.val |= tx_next_bit; + } + else { // -- Transmission is complete on this channel - if (intr_st & BIT(tx_done_bit)) { - RMT.int_clr.val |= BIT(tx_done_bit); + if (intr_st & tx_done_bit) { + // Set pin output before toggling RMT + gpio_set_level(pController->mPin,0); + gpio_matrix_out(pController->mPin, 0x100 , 0,0); // 
SIG_GPIO_OUT_IDX + RMT.int_clr.val &= ~tx_next_bit; + RMT.int_clr.val |= tx_done_bit; doneOnChannel(rmt_channel_t(channel), 0); } } @@ -350,72 +551,207 @@ void IRAM_ATTR ESP32RMTController::interruptHandler(void *arg) // Puts 32 bits of pixel data into the next 32 slots in the RMT memory // Each data bit is represented by a 32-bit RMT item that specifies how // long to hold the signal high, followed by how long to hold it low. + void IRAM_ATTR ESP32RMTController::fillNext(bool check_time) { - uint32_t now = __clock_cycles(); - if (check_time) { - if (mLastFill != 0 and now > mLastFill) { - uint32_t delta = (now - mLastFill); - if (delta > mMaxCyclesPerFill) { - // Serial.print(delta); - // Serial.print(" BAIL "); - // Serial.println(mCur); - // rmt_tx_stop(mRMT_channel); - // Inline the code for rmt_tx_stop, so it can be placed in IRAM - /** -- Go back to the original strategy of just setting mCur = mSize - and letting the regular 'stop' process happen - * mRMT_mem_start = 0; - RMT.int_ena.val &= ~(1 << (mRMT_channel * 3)); - RMT.conf_ch[mRMT_channel].conf1.tx_start = 0; - RMT.conf_ch[mRMT_channel].conf1.mem_rd_rst = 1; - RMT.conf_ch[mRMT_channel].conf1.mem_rd_rst = 0; - */ - mCur = mSize; - } - } - } - mLastFill = now; - // -- Get the zero and one values into local variables register uint32_t one_val = mOne.val; register uint32_t zero_val = mZero.val; // -- Use locals for speed - volatile register uint32_t * pItem = mRMT_mem_ptr; - - for (register int i = 0; i < PULSES_PER_FILL/8; i++) { - if (mCur < mSize) { - - // -- Get the next four bytes of pixel data - register uint32_t pixeldata = mPixelData[mCur] << 24; - mCur++; - - // Shift bits out, MSB first, setting RMTMEM.chan[n].data32[x] to the - // rmt_item32_t value corresponding to the buffered bit value - for (register uint32_t j = 0; j < 8; j++) { - *pItem++ = (pixeldata & 0x80000000L) ? 
one_val : zero_val; - // Replaces: RMTMEM.chan[mRMT_channel].data32[mCurPulse].val = val; - - pixeldata <<= 1; - } - } else { - // -- No more data; signal to the RMT we are done by filling the - // rest of the buffer with zeros - *pItem++ = 0; - } + register uint32_t * pItem = (uint32_t *) mRMT_mem_ptr; + + register unsigned char* end = mCurPtr + PULSES_PER_FILL/8; + if (end > mEndPtr) end = mEndPtr; + register unsigned char* curPtr = mCurPtr; + + while(curPtr < end) { + // -- Get the next four bytes of pixel data + register uint32_t pixeldata4 = *((uint32_t*) curPtr); + curPtr+=4; + // This code is exactly as described above in the interrupt handler + asm( + "movgez a11, %0, %3\n" // if the high bit is zero load the zero_val into reg + "movltz a11, %1, %3\n" // if it's 1 load one_val instead + "s32i a11, %2, 96\n" // store a11 into *(pItem+offset). Offset is weird because little endian + + "slli %3, %3, 1\n" // Shift next bit into bit 31 (sign bit) + "movgez a11, %0, %3\n" + "movltz a11, %1, %3\n" + "s32i a11, %2, 100\n" + + "slli %3, %3, 1\n" // Repeat 30 more times... 
+ "movgez a11, %0, %3\n" + "movltz a11, %1, %3\n" + "s32i a11, %2, 104\n" + + "slli %3, %3, 1\n" + "movgez a11, %0, %3\n" + "movltz a11, %1, %3\n" + "s32i a11, %2, 108\n" + + "slli %3, %3, 1\n" + "movgez a11, %0, %3\n" + "movltz a11, %1, %3\n" + "s32i a11, %2, 112\n" + + "slli %3, %3, 1\n" + "movgez a11, %0, %3\n" + "movltz a11, %1, %3\n" + "s32i a11, %2, 116\n" + + "slli %3, %3, 1\n" + "movgez a11, %0, %3\n" + "movltz a11, %1, %3\n" + "s32i a11, %2, 120\n" + + "slli %3, %3, 1\n" + "movgez a11, %0, %3\n" + "movltz a11, %1, %3\n" + "s32i a11, %2, 124\n" + + // Second byte + "slli %3, %3, 1\n" + "movgez a11, %0, %3\n" + "movltz a11, %1, %3\n" + "s32i a11, %2, 64\n" + + "slli %3, %3, 1\n" + "movgez a11, %0, %3\n" + "movltz a11, %1, %3\n" + "s32i a11, %2, 68\n" + + "slli %3, %3, 1\n" + "movgez a11, %0, %3\n" + "movltz a11, %1, %3\n" + "s32i a11, %2, 72\n" + + "slli %3, %3, 1\n" + "movgez a11, %0, %3\n" + "movltz a11, %1, %3\n" + "s32i a11, %2, 76\n" + + "slli %3, %3, 1\n" + "movgez a11, %0, %3\n" + "movltz a11, %1, %3\n" + "s32i a11, %2, 80\n" + + "slli %3, %3, 1\n" + "movgez a11, %0, %3\n" + "movltz a11, %1, %3\n" + "s32i a11, %2, 84\n" + + "slli %3, %3, 1\n" + "movgez a11, %0, %3\n" + "movltz a11, %1, %3\n" + "s32i a11, %2, 88\n" + + "slli %3, %3, 1\n" + "movgez a11, %0, %3\n" + "movltz a11, %1, %3\n" + "s32i a11, %2, 92\n" + + // Third byte + "slli %3, %3, 1\n" + "movgez a11, %0, %3\n" + "movltz a11, %1, %3\n" + "s32i a11, %2, 32\n" + + "slli %3, %3, 1\n" + "movgez a11, %0, %3\n" + "movltz a11, %1, %3\n" + "s32i a11, %2, 36\n" + + "slli %3, %3, 1\n" + "movgez a11, %0, %3\n" + "movltz a11, %1, %3\n" + "s32i a11, %2, 40\n" + + "slli %3, %3, 1\n" + "movgez a11, %0, %3\n" + "movltz a11, %1, %3\n" + "s32i a11, %2, 44\n" + + "slli %3, %3, 1\n" + "movgez a11, %0, %3\n" + "movltz a11, %1, %3\n" + "s32i a11, %2, 48\n" + + "slli %3, %3, 1\n" + "movgez a11, %0, %3\n" + "movltz a11, %1, %3\n" + "s32i a11, %2, 52\n" + + "slli %3, %3, 1\n" + "movgez a11, %0, %3\n" + "movltz a11, %1, 
%3\n" + "s32i a11, %2, 56\n" + + "slli %3, %3, 1\n" + "movgez a11, %0, %3\n" + "movltz a11, %1, %3\n" + "s32i a11, %2, 60\n" + + "slli %3, %3, 1\n" + "movgez a11, %0, %3\n" + "movltz a11, %1, %3\n" + "s32i a11, %2, 0\n" + + "slli %3, %3, 1\n" + "movgez a11, %0, %3\n" + "movltz a11, %1, %3\n" + "s32i a11, %2, 4\n" + + "slli %3, %3, 1\n" + "movgez a11, %0, %3\n" + "movltz a11, %1, %3\n" + "s32i a11, %2, 8\n" + + "slli %3, %3, 1\n" + "movgez a11, %0, %3\n" + "movltz a11, %1, %3\n" + "s32i a11, %2, 12\n" + + "slli %3, %3, 1\n" + "movgez a11, %0, %3\n" + "movltz a11, %1, %3\n" + "s32i a11, %2, 16\n" + + "slli %3, %3, 1\n" + "movgez a11, %0, %3\n" + "movltz a11, %1, %3\n" + "s32i a11, %2, 20\n" + + "slli %3, %3, 1\n" + "movgez a11, %0, %3\n" + "movltz a11, %1, %3\n" + "s32i a11, %2, 24\n" + + "slli %3, %3, 1\n" + "movgez a11, %0, %3\n" + "movltz a11, %1, %3\n" + "s32i a11, %2, 28\n" + : + : "r" (zero_val), "r" (one_val), "r" (pItem), "r" (pixeldata4) + : "a11"); + + pItem+=32; } + mCurPtr= curPtr; + if (end == mEndPtr) *pItem++ = 0; // -- Flip to the other half, resetting the pointer if necessary mWhichHalf++; if (mWhichHalf == 2) { - pItem = mRMT_mem_start; + pItem = (uint32_t*) mRMT_mem_start; mWhichHalf = 0; } // -- Store the new pointer back into the object - mRMT_mem_ptr = pItem; + mRMT_mem_ptr = (volatile uint32_t*) pItem; } + // -- Init pulse buffer // Set up the buffer that will hold all of the pulse items for this // controller. 
diff --git a/src/platforms/esp/32/clockless_rmt_esp32.h b/src/platforms/esp/32/clockless_rmt_esp32.h index 3a10f9caf26..bbce8aa8898 100644 --- a/src/platforms/esp/32/clockless_rmt_esp32.h +++ b/src/platforms/esp/32/clockless_rmt_esp32.h @@ -173,6 +173,7 @@ __attribute__ ((always_inline)) inline static uint32_t __clock_cycles() { #define NS_TO_CYCLES(n) ( (n) / NS_PER_CYCLE ) #define RMT_RESET_DURATION NS_TO_CYCLES(50000) + // -- Core or custom driver #ifndef FASTLED_RMT_BUILTIN_DRIVER #define FASTLED_RMT_BUILTIN_DRIVER false @@ -211,7 +212,8 @@ class ESP32RMTController // -- Pixel data uint8_t * mPixelData; int mSize; - int mCur; + uint8_t * mCurPtr; + uint8_t * mEndPtr; // -- RMT memory volatile uint32_t * mRMT_mem_ptr;