diff --git a/README.md b/README.md
index 1eadbe768e..9bc34de49e 100644
--- a/README.md
+++ b/README.md
@@ -91,6 +91,7 @@ All tests support the same set of arguments :
   * `-c,--check <check iteration count>` perform count iterations, checking correctness of results on each iteration. This can be quite slow on large numbers of GPUs. Default : 1.
   * `-z,--blocking <0/1>` Make NCCL collective blocking, i.e. have CPUs wait and sync after each collective. Default : 0.
   * `-G,--cudagraph <num graph launches>` Capture iterations as a CUDA graph and then replay specified number of times. Default : 0.
+  * `-F,--cache_flush <cache flush after every -F iteration>` Flush the cache after every -F iterations. Default : 0 (no cache flush).
 
 ## Unit tests
 
diff --git a/src/common.cu b/src/common.cu
index 19c60913d8..4dcac1cbd9 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -123,7 +123,6 @@ static int enable_cache_flush = 0;
 
 extern "C" __global__ void flush_icache()
 {
-    printf("flush_icache called \n");
     asm __volatile__("s_icache_inv \n\t"
                      "s_nop 0 \n\t"
                      "s_nop 0 \n\t"