Wait an appropriate delay before SyncWorker retries after soft errors (bitfireAT/davx5#337)

* Use appropriate delayUntil value for retrying syncs on 503s.

Crop server suggested retryAfter value to self defined min/max values and use a reasonable default value if non-existent.

* Add tests for getDelayUntil

* Wait appropriate delay, before retrying sync after a soft error happened

* Increase max and default sync delays after soft errors

* Increase initial backoff time for SyncWorker retries

* Minor getDelayUntil changes

* Minor changes

- store delayUntil in seconds
- pass duration instead of timestamp to Thread.sleep
- other minor changes

* Use Instant instead of Long timestamps

* Correct calculation of blocking duration

* Indicate soft error occurred on 503 server message

---------

Co-authored-by: Ricki Hirner <hirner@bitfire.at>
This commit is contained in:
Sunik Kupfer 2023-09-05 16:43:30 +02:00 committed by Ricki Hirner
parent b670979f12
commit 5640250359
4 changed files with 89 additions and 10 deletions

View file

@ -11,12 +11,21 @@ import androidx.work.WorkInfo
import androidx.work.WorkManager
import androidx.work.WorkQuery
import org.jetbrains.annotations.TestOnly
import org.junit.Assert.assertTrue
import java.util.concurrent.CountDownLatch
import java.util.concurrent.TimeUnit
import java.util.concurrent.TimeoutException
object TestUtils {
fun assertWithin(expected: Long, actual: Long, tolerance: Long) {
val absDifference = Math.abs(expected - actual)
assertTrue(
"$actual not within ($expected ± $tolerance)",
absDifference <= tolerance
)
}
@TestOnly
fun workScheduledOrRunning(context: Context, workerName: String): Boolean =
workInStates(context, workerName, listOf(

View file

@ -19,6 +19,7 @@ import at.bitfire.dav4jvm.Response
import at.bitfire.dav4jvm.Response.HrefRelation
import at.bitfire.dav4jvm.property.GetETag
import at.bitfire.davdroid.R
import at.bitfire.davdroid.TestUtils.assertWithin
import at.bitfire.davdroid.db.Credentials
import at.bitfire.davdroid.db.SyncState
import at.bitfire.davdroid.network.HttpClient
@ -30,8 +31,15 @@ import okhttp3.Protocol
import okhttp3.internal.http.StatusLine
import okhttp3.mockwebserver.MockResponse
import okhttp3.mockwebserver.MockWebServer
import org.junit.*
import org.junit.Assert.*
import org.junit.After
import org.junit.AfterClass
import org.junit.Assert.assertEquals
import org.junit.Assert.assertFalse
import org.junit.Assert.assertTrue
import org.junit.Before
import org.junit.BeforeClass
import org.junit.Rule
import org.junit.Test
import java.time.Instant
import java.util.concurrent.TimeUnit
import javax.inject.Inject
@ -112,6 +120,30 @@ class SyncManagerTest {
}
@Test
fun testGetDelayUntil_defaultOnNull() {
val now = Instant.now()
val delayUntil = SyncManager.getDelayUntil(null).epochSecond
val default = now.plusSeconds(SyncManager.DELAY_UNTIL_DEFAULT).epochSecond
assertWithin(default, delayUntil, 5)
}
@Test
fun testGetDelayUntil_reducesToMax() {
val now = Instant.now()
val delayUntil = SyncManager.getDelayUntil(now.plusSeconds(10*24*60*60)).epochSecond
val max = now.plusSeconds(SyncManager.DELAY_UNTIL_MAX).epochSecond
assertWithin(max, delayUntil, 5)
}
@Test
fun testGetDelayUntil_increasesToMin() {
val delayUntil = SyncManager.getDelayUntil(Instant.EPOCH).epochSecond
val min = Instant.now().plusSeconds(SyncManager.DELAY_UNTIL_MIN).epochSecond
assertWithin(min, delayUntil, 5)
}
private fun queryCapabilitiesResponse(cTag: String? = null): MockResponse {
val body = StringBuilder()
body.append("<?xml version=\"1.0\" encoding=\"utf-8\" ?>\n" +
@ -148,7 +180,7 @@ class SyncManagerTest {
.plusSeconds(60)
.toEpochMilli()
// 5 sec tolerance for test
assertTrue(result.delayUntil > (expected - 5000) && result.delayUntil < (expected + 5000))
assertWithin(expected, result.delayUntil*1000, 5000)
}
@Test

View file

@ -54,6 +54,7 @@ import java.io.InterruptedIOException
import java.lang.ref.WeakReference
import java.net.HttpURLConnection
import java.security.cert.CertificateException
import java.time.Instant
import java.util.*
import java.util.concurrent.LinkedBlockingQueue
import java.util.concurrent.ThreadPoolExecutor
@ -87,8 +88,39 @@ abstract class SyncManager<ResourceType: LocalResource<*>, out CollectionType: L
companion object {
const val DEBUG_INFO_MAX_RESOURCE_DUMP_SIZE = 100*FileUtils.ONE_KB.toInt()
const val MAX_MULTIGET_RESOURCES = 10
const val DELAY_UNTIL_DEFAULT = 15*60L // 15 min
const val DELAY_UNTIL_MIN = 1*60L // 1 min
const val DELAY_UNTIL_MAX = 2*60*60L // 2 hours
/**
* Returns appropriate sync retry delay in seconds, considering the servers suggestion
* ([DELAY_UNTIL_DEFAULT] if no server suggestion).
*
* Takes current time into account to calculate intervals. Interval
* will be restricted to values between [DELAY_UNTIL_MIN] and [DELAY_UNTIL_MAX].
*
* @param retryAfter optional server suggestion on how long to wait before retrying
* @return until when to wait before sync can be retried
*/
fun getDelayUntil(retryAfter: Instant?): Instant {
val now = Instant.now()
if (retryAfter == null)
return now.plusSeconds(DELAY_UNTIL_DEFAULT)
// take server suggestion, but restricted to plausible min/max values
val min = now.plusSeconds(DELAY_UNTIL_MIN)
val max = now.plusSeconds(DELAY_UNTIL_MAX)
return when {
min > retryAfter -> min
max < retryAfter -> max
else -> retryAfter
}
}
var _workDispatcher: WeakReference<CoroutineDispatcher>? = null
/**
* We use our own dispatcher to
@ -109,6 +141,7 @@ abstract class SyncManager<ResourceType: LocalResource<*>, out CollectionType: L
).asCoroutineDispatcher()
return newDispatcher
}
}
init {
@ -274,9 +307,9 @@ abstract class SyncManager<ResourceType: LocalResource<*>, out CollectionType: L
// specific HTTP errors
is ServiceUnavailableException -> {
Logger.log.log(Level.WARNING, "Got 503 Service unavailable, trying again later", e)
e.retryAfter?.let { retryAfter ->
syncResult.delayUntil = retryAfter.toEpochMilli()
}
// determine when to retry
syncResult.delayUntil = getDelayUntil(e.retryAfter).epochSecond
syncResult.stats.numIoExceptions++ // Indicate a soft error occurred
}
// all others

View file

@ -5,7 +5,6 @@
package at.bitfire.davdroid.syncadapter
import android.accounts.Account
import android.app.PendingIntent
import android.content.ContentProviderClient
import android.content.ContentResolver
import android.content.Context
@ -42,7 +41,6 @@ import at.bitfire.davdroid.log.Logger
import at.bitfire.davdroid.network.ConnectionUtils.internetAvailable
import at.bitfire.davdroid.network.ConnectionUtils.wifiAvailable
import at.bitfire.davdroid.settings.AccountSettings
import at.bitfire.davdroid.ui.DebugInfoActivity
import at.bitfire.davdroid.ui.NotificationUtils
import at.bitfire.davdroid.ui.NotificationUtils.notifyIfPossible
import at.bitfire.davdroid.ui.account.WifiPermissionsActivity
@ -149,7 +147,7 @@ class SyncWorker @AssistedInject constructor(
.setExpedited(OutOfQuotaPolicy.RUN_AS_NON_EXPEDITED_WORK_REQUEST)
.setBackoffCriteria(
BackoffPolicy.EXPONENTIAL,
WorkRequest.MIN_BACKOFF_MILLIS,
WorkRequest.DEFAULT_BACKOFF_DELAY_MILLIS, // 30 sec
TimeUnit.MILLISECONDS
)
.setConstraints(constraints)
@ -275,7 +273,6 @@ class SyncWorker @AssistedInject constructor(
var syncThread: Thread? = null
override fun doWork(): Result {
// ensure we got the required arguments
val account = Account(
inputData.getString(ARG_ACCOUNT_NAME) ?: throw IllegalArgumentException("$ARG_ACCOUNT_NAME required"),
@ -357,6 +354,14 @@ class SyncWorker @AssistedInject constructor(
if (result.hasSoftError()) {
Logger.log.warning("Soft error while syncing: result=$result, stats=${result.stats}")
if (runAttemptCount < MAX_RUN_ATTEMPTS) {
val blockDuration = result.delayUntil - System.currentTimeMillis()/1000
Logger.log.warning("Waiting for $blockDuration seconds, before retrying ...")
// We block the SyncWorker here so that it won't be started by the sync framework immediately again.
// This should be replaced by proper work scheduling as soon as we don't depend on the sync framework anymore.
if (blockDuration > 0)
Thread.sleep(blockDuration*1000)
Logger.log.warning("Retrying on soft error (attempt $runAttemptCount of $MAX_RUN_ATTEMPTS)")
return Result.retry()
}