DTLS: Fix ARQ bug, use openssl timeout. 4.0.84

pull/2257/head
winlin 4 years ago
parent 3c6e466280
commit 02aac0fea4

@ -186,6 +186,7 @@ Other documents:
## V4 changes
* v4.0, 2021-03-09, DTLS: Fix ARQ bug, use openssl timeout. 4.0.84
* v4.0, 2021-03-08, DTLS: Fix dead loop by duplicated Alert message. 4.0.83
* v4.0, 2021-03-08, Fix bug when client DTLS is passive. 4.0.82
* v4.0, 2021-03-03, Fix [#2106][bug #2106], [#2011][bug #2011], RTMP/AAC transcode to Opus bug. 4.0.81

@ -35,6 +35,7 @@ using namespace std;
#include <srs_app_utility.hpp>
#include <srs_kernel_rtc_rtp.hpp>
#include <srs_app_log.hpp>
#include <srs_kernel_utility.hpp>
#include <srtp2/srtp.h>
#include <openssl/ssl.h>
@ -43,6 +44,35 @@ using namespace std;
// Defined in HTTP/HTTPS client.
extern int srs_verify_callback(int preverify_ok, X509_STORE_CTX *ctx);
// Setup the openssl timeout for DTLS packet.
// @see https://www.openssl.org/docs/man1.1.1/man3/DTLS_set_timer_cb.html
//
// Use step timeout for ARQ, [50, 100, 200, 400, 800, 1600, 3200, 6400, 12800, 25600, 51200] in ms,
// then total timeout is sum([50, 100, 200, 400, 800, 1600, 3200, 6400, 12800, 25600, 51200]) = 102350ms.
//
// @remark The connection might be closed for timeout in about 30s by default, which stop the DTLS ARQ.
unsigned int dtls_timer_cb(SSL* dtls, unsigned int previous_us)
{
SrsDtlsImpl* dtls_impl = (SrsDtlsImpl*)SSL_get_ex_data(dtls, 0);
srs_assert(dtls_impl);
// Double the timeout. Note that it can be 0.
unsigned int timeout_us = previous_us * 2;
// If previous_us is 0, for example, the HelloVerifyRequest, we should response it ASAP.
// When got ServerHello, we should reset the timer.
if (previous_us == 0 || dtls_impl->should_reset_timer()) {
timeout_us = 50 * 1000; // in us
}
// Never exceed the max timeout.
timeout_us = srs_min(timeout_us, 30 * 1000 * 1000); // in us
srs_info("DTLS: ARQ timer cb timeout=%ums, previous=%ums", timeout_us, previous_us);
return timeout_us;
}
// Print the information of SSL, DTLS alert as such.
void ssl_on_info(const SSL* dtls, int where, int ret)
{
@ -377,8 +407,6 @@ SrsDtlsImpl::SrsDtlsImpl(ISrsDtlsCallback* callback)
callback_ = callback;
handshake_done_for_us = false;
last_outgoing_packet_cache = new uint8_t[kRtpPacketSize];
nn_last_outgoing_packet = 0;
nn_arq_packets = 0;
version_ = SrsDtlsVersionAuto;
@ -401,8 +429,6 @@ SrsDtlsImpl::~SrsDtlsImpl()
SSL_free(dtls);
dtls = NULL;
}
srs_freepa(last_outgoing_packet_cache);
}
srs_error_t SrsDtlsImpl::initialize(std::string version, std::string role)
@ -431,6 +457,19 @@ srs_error_t SrsDtlsImpl::initialize(std::string version, std::string role)
SSL_set_options(dtls, SSL_OP_NO_QUERY_MTU);
SSL_set_mtu(dtls, kRtpPacketSize);
// @see https://linux.die.net/man/3/openssl_version_number
// MM NN FF PP S
// 0x1010102fL = 0x1 01 01 02 fL // 1.1.1b release
// MM(major) = 0x1 // 1.*
// NN(minor) = 0x01 // 1.1.*
// FF(fix) = 0x01 // 1.1.1*
// PP(patch) = 'a' + 0x02 - 1 = 'b' // 1.1.1b *
// S(status) = 0xf = release // 1.1.1b release
// @note Status 0 for development, 1 to e for betas 1 to 14, and f for release.
#if OPENSSL_VERSION_NUMBER >= 0x1010102fL // 1.1.1b
DTLS_set_timer_cb(dtls, dtls_timer_cb);
#endif
if ((bio_in = BIO_new(BIO_s_mem())) == NULL) {
return srs_error_new(ERROR_OpenSslBIONew, "BIO_new in");
}
@ -461,6 +500,12 @@ srs_error_t SrsDtlsImpl::do_on_dtls(char* data, int nb_data)
{
srs_error_t err = srs_success;
// When already done, only for us, we still got message from client,
// it might be our response is lost, or application data.
if (handshake_done_for_us) {
srs_trace("DTLS: After done, got %d bytes", nb_data);
}
int r0 = 0;
// TODO: FIXME: Why reset it before writing?
if ((r0 = BIO_reset(bio_in)) != 1) {
@ -471,7 +516,7 @@ srs_error_t SrsDtlsImpl::do_on_dtls(char* data, int nb_data)
}
// Trace the detail of DTLS packet.
state_trace((uint8_t*)data, nb_data, true, r0, SSL_ERROR_NONE, false, false);
state_trace((uint8_t*)data, nb_data, true, r0, SSL_ERROR_NONE, false);
if ((r0 = BIO_write(bio_in, data, nb_data)) <= 0) {
// TODO: 0 or -1 maybe block, use BIO_should_retry to check.
@ -502,6 +547,18 @@ srs_error_t SrsDtlsImpl::do_on_dtls(char* data, int nb_data)
if (r1 != SSL_ERROR_WANT_READ && r1 != SSL_ERROR_WANT_WRITE) {
break;
}
// We got data in memory, which can not read by SSL_read, generally, it's handshake data.
uint8_t* data = NULL;
int size = BIO_get_mem_data(bio_out, (char**)&data);
// Logging when got SSL original data.
state_trace((uint8_t*)data, size, true, r0, r1, false);
if (size > 0 && (err = callback_->write_dtls_data(data, size)) != srs_success) {
return srs_error_wrap(err, "dtls send size=%u, data=[%s]", size,
srs_string_dumps_hex((char*)data, size, 32).c_str());
}
continue;
}
@ -537,18 +594,10 @@ srs_error_t SrsDtlsImpl::do_handshake()
// The data to send out to peer.
uint8_t* data = NULL;
int size = BIO_get_mem_data(bio_out, &data);
// Callback when got SSL original data.
bool cache = false;
on_ssl_out_data(data, size, cache);
state_trace((uint8_t*)data, size, false, r0, r1, cache, false);
int size = BIO_get_mem_data(bio_out, (char**)&data);
// Update the packet cache.
if (size > 0 && data != last_outgoing_packet_cache && size < kRtpPacketSize) {
memcpy(last_outgoing_packet_cache, data, size);
nn_last_outgoing_packet = size;
}
// Logging when got SSL original data.
state_trace((uint8_t*)data, size, false, r0, r1, false);
// Callback for the final output data, before send-out.
if ((err = on_final_out_data(data, size)) != srs_success) {
@ -569,7 +618,7 @@ srs_error_t SrsDtlsImpl::do_handshake()
return err;
}
void SrsDtlsImpl::state_trace(uint8_t* data, int length, bool incoming, int r0, int r1, bool cache, bool arq)
void SrsDtlsImpl::state_trace(uint8_t* data, int length, bool incoming, int r0, int r1, bool arq)
{
// change_cipher_spec(20), alert(21), handshake(22), application_data(23)
// @see https://tools.ietf.org/html/rfc2246#section-6.2.1
@ -588,8 +637,8 @@ void SrsDtlsImpl::state_trace(uint8_t* data, int length, bool incoming, int r0,
handshake_type = (uint8_t)data[13];
}
srs_trace("DTLS: %s %s, done=%u, cache=%u, arq=%u/%u, r0=%d, r1=%d, len=%u, cnt=%u, size=%u, hs=%u",
(is_dtls_client()? "Active":"Passive"), (incoming? "RECV":"SEND"), handshake_done_for_us, cache, arq,
srs_trace("DTLS: State %s %s, done=%u, arq=%u/%u, r0=%d, r1=%d, len=%u, cnt=%u, size=%u, hs=%u",
(is_dtls_client()? "Active":"Passive"), (incoming? "RECV":"SEND"), handshake_done_for_us, arq,
nn_arq_packets, r0, r1, length, content_type, size, handshake_type);
}
@ -640,15 +689,9 @@ SrsDtlsClientImpl::SrsDtlsClientImpl(ISrsDtlsCallback* callback) : SrsDtlsImpl(c
trd = NULL;
state_ = SrsDtlsStateInit;
// The first wait and base interval for ARQ.
arq_interval = 10 * SRS_UTIME_MILLISECONDS;
// Use step timeout for ARQ, the total timeout is sum(arq_to_ratios)*arq_interval.
// for example, if arq_interval is 10ms, arq_to_ratios is [3, 6, 9, 15, 20, 40, 80, 160],
// then total timeout is sum([3, 6, 9, 15, 20, 40, 80, 160]) * 10ms = 3330ms.
int ratios[] = {3, 6, 9, 15, 20, 40, 80, 160};
srs_assert(sizeof(arq_to_ratios) == sizeof(ratios));
memcpy(arq_to_ratios, ratios, sizeof(ratios));
// the max dtls retry num is 12 in openssl.
arq_max_retry = 12 * 2; // ARQ for ClientHello and Certificate.
reset_timer_ = true;
}
SrsDtlsClientImpl::~SrsDtlsClientImpl()
@ -672,60 +715,45 @@ srs_error_t SrsDtlsClientImpl::initialize(std::string version, std::string role)
}
srs_error_t SrsDtlsClientImpl::start_active_handshake()
{
return do_handshake();
}
srs_error_t SrsDtlsClientImpl::on_dtls(char* data, int nb_data)
{
srs_error_t err = srs_success;
// When got packet, stop the ARQ if server in the first ARQ state SrsDtlsStateServerHello.
// @note But for ARQ state, we should never stop the ARQ, for example, we are in the second ARQ sate
// SrsDtlsStateServerDone, but we got previous late wrong packet ServeHello, which is not the expect
// packet SessionNewTicket, we should never stop the ARQ thread.
if (state_ == SrsDtlsStateServerHello) {
stop_arq();
if ((err = do_handshake()) != srs_success) {
return srs_error_wrap(err, "start handshake");
}
if ((err = SrsDtlsImpl::on_dtls(data, nb_data)) != srs_success) {
return err;
if ((err = start_arq()) != srs_success) {
return srs_error_wrap(err, "start arq");
}
return err;
}
void SrsDtlsClientImpl::on_ssl_out_data(uint8_t*& data, int& size, bool& cached)
bool SrsDtlsClientImpl::should_reset_timer()
{
// DTLS client use ARQ thread to send cached packet.
cached = false;
bool v = reset_timer_;
reset_timer_ = false;
return v;
}
// Note that only handshake sending packets drives the state, neither ARQ nor the
// final-packets(after handshake done) drives it.
srs_error_t SrsDtlsClientImpl::on_final_out_data(uint8_t* data, int size)
{
srs_error_t err = srs_success;
// Driven ARQ and state for DTLS client.
// If we are sending client hello, change from init to new state.
if (state_ == SrsDtlsStateInit && size > 14 && data[13] == 1) {
if (state_ == SrsDtlsStateInit && size > 14 && data[0] == 22 && data[13] == 1) {
state_ = SrsDtlsStateClientHello;
}
// If we are sending certificate, change from SrsDtlsStateServerHello to new state.
if (state_ == SrsDtlsStateServerHello && size > 14 && data[13] == 11) {
if (state_ == SrsDtlsStateServerHello && size > 14 && data[0] == 22 && data[13] == 11) {
state_ = SrsDtlsStateClientCertificate;
}
// Try to start the ARQ for client.
if ((state_ == SrsDtlsStateClientHello || state_ == SrsDtlsStateClientCertificate)) {
if (state_ == SrsDtlsStateClientHello) {
state_ = SrsDtlsStateServerHello;
} else if (state_ == SrsDtlsStateClientCertificate) {
state_ = SrsDtlsStateServerDone;
}
if ((err = start_arq()) != srs_success) {
return srs_error_wrap(err, "start arq");
}
// When we send out the certificate, we should reset the timer.
reset_timer_ = true;
srs_info("DTLS: Reset the timer for ServerHello");
}
return err;
@ -735,8 +763,15 @@ srs_error_t SrsDtlsClientImpl::on_handshake_done()
{
srs_error_t err = srs_success;
// When handshake done, stop the ARQ.
// Ignore if done.
if (state_ == SrsDtlsStateClientDone) {
return err;
}
// Change to done state.
state_ = SrsDtlsStateClientDone;
// When handshake done, stop the ARQ.
stop_arq();
// Notify connection the DTLS is done.
@ -756,8 +791,6 @@ srs_error_t SrsDtlsClientImpl::start_arq()
{
srs_error_t err = srs_success;
srs_info("start arq, state=%u", state_);
// Dispose the previous ARQ thread.
srs_freep(trd);
trd = new SrsSTCoroutine("dtls", this, _srs_context->get_id());
@ -772,20 +805,23 @@ srs_error_t SrsDtlsClientImpl::start_arq()
void SrsDtlsClientImpl::stop_arq()
{
srs_info("stop arq, state=%u", state_);
srs_freep(trd);
srs_info("stop arq, done");
}
srs_error_t SrsDtlsClientImpl::cycle()
{
srs_error_t err = srs_success;
// Limit the max retry for ARQ.
for (int i = 0; i < (int)(sizeof(arq_to_ratios) / sizeof(int)); i++) {
srs_utime_t arq_to = arq_interval * arq_to_ratios[i];
srs_usleep(arq_to);
// Limit the max retry for ARQ, to avoid infinite loop.
// Note that we set the timeout to [50, 100, 200, 400, 800, 1600, 3200, 6400, 12800, 25600, 51200] in ms,
// but the actual timeout is limit to 1s:
// 50ms, 100ms, 200ms, 400ms, 800ms, (1000ms,600ms), (200ms,1000ms,1000ms,1000ms),
// (400ms,1000ms,1000ms,1000ms,1000ms,1000ms,1000ms), ...
// So when the max ARQ limit to 12 times, the max loop is about 103.
const int max_loop = 103;
int arq_count = 0;
for (int i = 0; arq_count < arq_max_retry && i < max_loop; i++) {
// We ignore any error for ARQ thread.
if ((err = trd->pull()) != srs_success) {
srs_freep(err);
@ -798,27 +834,57 @@ srs_error_t SrsDtlsClientImpl::cycle()
}
// For DTLS client ARQ, the state should be specified.
if (state_ != SrsDtlsStateServerHello && state_ != SrsDtlsStateServerDone) {
if (state_ != SrsDtlsStateClientHello && state_ != SrsDtlsStateClientCertificate) {
return err;
}
// Try to retransmit the packet.
uint8_t* data = last_outgoing_packet_cache;
int size = nn_last_outgoing_packet;
// If there is a timeout in progress, it sets *out to the time remaining
// and returns one. Otherwise, it returns zero.
int r0 = 0; timeval to = {0};
if ((r0 = DTLSv1_get_timeout(dtls, &to)) == 0) {
// No timeout, for example?, wait for a default 50ms.
srs_usleep(50 * SRS_UTIME_MILLISECONDS);
continue;
}
srs_utime_t timeout = to.tv_sec + to.tv_usec;
// There is timeout to wait, so we should wait, because there is no packet in openssl.
if (timeout > 0) {
// Note that if we use very small timeout, say 10ms, the client might got two ClientHello,
// then it confused and send HelloVerifyRequest(3) to check it, this is not the efficiency
// way, so we limit the min timeout here to make it faster.
// TODO: FIXME: Config it.
srs_usleep(srs_max(50 * SRS_UTIME_MILLISECONDS, timeout));
continue;
}
if (size) {
// Trace the detail of DTLS packet.
state_trace((uint8_t*)data, size, false, 1, SSL_ERROR_NONE, true, true);
nn_arq_packets++;
// The timeout is 0, so there must be a ARQ packet to transmit in openssl.
r0 = BIO_reset(bio_out); int r1 = SSL_get_error(dtls, r0);
if (r0 != 1) {
return srs_error_new(ERROR_OpenSslBIOReset, "BIO_reset r0=%d, r1=%d", r0, r1);
}
if ((err = callback_->write_dtls_data(data, size)) != srs_success) {
return srs_error_wrap(err, "dtls send size=%u, data=[%s]", size,
srs_string_dumps_hex((char*)data, size, 32).c_str());
}
// DTLSv1_handle_timeout is called when a DTLS handshake timeout expires. If no timeout
// had expired, it returns 0. Otherwise, it retransmits the previous flight of handshake
// messages and returns 1. If too many timeouts had expired without progress or an error
// occurs, it returns -1.
r0 = DTLSv1_handle_timeout(dtls); r1 = SSL_get_error(dtls, r0);
if (r0 != 1) {
return srs_error_new(ERROR_RTC_DTLS, "ARQ r0=%d, r1=%d", r0, r1);
}
srs_info("arq cycle, done=%u, state=%u, retry=%d, interval=%dms, to=%dms, size=%d, nn=%d", handshake_done_for_us,
state_, i, srsu2msi(arq_interval), srsu2msi(arq_to), size, nn_arq_packets);
// The data to send out to peer.
uint8_t* data = NULL;
int size = BIO_get_mem_data(bio_out, (char**)&data);
arq_count++;
nn_arq_packets++;
state_trace((uint8_t*)data, size, false, r0, r1, true);
if (size > 0 && (err = callback_->write_dtls_data(data, size)) != srs_success) {
return srs_error_wrap(err, "dtls send size=%u, data=[%s]", size,
srs_string_dumps_hex((char*)data, size, 32).c_str());
}
}
return err;
@ -848,23 +914,19 @@ srs_error_t SrsDtlsServerImpl::initialize(std::string version, std::string role)
srs_error_t SrsDtlsServerImpl::start_active_handshake()
{
// For DTLS server, we do nothing, because DTLS client drive it.
return srs_success;
}
void SrsDtlsServerImpl::on_ssl_out_data(uint8_t*& data, int& size, bool& cached)
bool SrsDtlsServerImpl::should_reset_timer()
{
// If outgoing packet is empty, we use the last cache.
// @remark Only for DTLS server, because DTLS client use ARQ thread to send cached packet.
if (size <= 0 && nn_last_outgoing_packet) {
size = nn_last_outgoing_packet;
data = last_outgoing_packet_cache;
nn_arq_packets++;
cached = true;
}
// For DTLS server, we never use timer for ARQ, because DTLS client drive it.
return false;
}
srs_error_t SrsDtlsServerImpl::on_final_out_data(uint8_t* data, int size)
{
// No ARQ, driven by DTLS client packets.
return srs_success;
}

@ -121,9 +121,6 @@ protected:
// Whether the handshake is done, for us only.
// @remark For us only, means peer maybe not done, we also need to handle the DTLS packet.
bool handshake_done_for_us;
// DTLS packet cache, only last out-going packet.
uint8_t* last_outgoing_packet_cache;
int nn_last_outgoing_packet;
// The stat for ARQ packets.
int nn_arq_packets;
public:
@ -132,16 +129,16 @@ public:
public:
virtual srs_error_t initialize(std::string version, std::string role);
virtual srs_error_t start_active_handshake() = 0;
virtual bool should_reset_timer() = 0;
virtual srs_error_t on_dtls(char* data, int nb_data);
protected:
srs_error_t do_on_dtls(char* data, int nb_data);
srs_error_t do_handshake();
void state_trace(uint8_t* data, int length, bool incoming, int r0, int r1, bool cache, bool arq);
void state_trace(uint8_t* data, int length, bool incoming, int r0, int r1, bool arq);
public:
srs_error_t get_srtp_key(std::string& recv_key, std::string& send_key);
void callback_by_ssl(std::string type, std::string desc);
protected:
virtual void on_ssl_out_data(uint8_t*& data, int& size, bool& cached) = 0;
virtual srs_error_t on_final_out_data(uint8_t* data, int size) = 0;
virtual srs_error_t on_handshake_done() = 0;
virtual bool is_dtls_client() = 0;
@ -155,18 +152,19 @@ private:
SrsCoroutine* trd;
// The DTLS-client state to drive the ARQ thread.
SrsDtlsState state_;
// The timeout for ARQ.
srs_utime_t arq_interval;
int arq_to_ratios[8];
// The max ARQ retry.
int arq_max_retry;
// Should we reset the timer?
// It's true when init, or in state ServerHello.
bool reset_timer_;
public:
SrsDtlsClientImpl(ISrsDtlsCallback* callback);
virtual ~SrsDtlsClientImpl();
public:
virtual srs_error_t initialize(std::string version, std::string role);
virtual srs_error_t start_active_handshake();
virtual srs_error_t on_dtls(char* data, int nb_data);
virtual bool should_reset_timer();
protected:
virtual void on_ssl_out_data(uint8_t*& data, int& size, bool& cached);
virtual srs_error_t on_final_out_data(uint8_t* data, int size);
virtual srs_error_t on_handshake_done();
virtual bool is_dtls_client();
@ -185,8 +183,8 @@ public:
public:
virtual srs_error_t initialize(std::string version, std::string role);
virtual srs_error_t start_active_handshake();
virtual bool should_reset_timer();
protected:
virtual void on_ssl_out_data(uint8_t*& data, int& size, bool& cached);
virtual srs_error_t on_final_out_data(uint8_t* data, int size);
virtual srs_error_t on_handshake_done();
virtual bool is_dtls_client();

@ -24,6 +24,6 @@
#ifndef SRS_CORE_VERSION4_HPP
#define SRS_CORE_VERSION4_HPP
#define SRS_VERSION4_REVISION 83
#define SRS_VERSION4_REVISION 84
#endif

Loading…
Cancel
Save