*** dbinc/repmgr.h	2007-10-31 10:23:52.000000000 -0700
--- dbinc/repmgr.h	2007-10-31 10:23:53.000000000 -0700
***************
*** 36,41 ****
--- 36,55 ----
  #endif
  
  /*
+  * The (arbitrary) maximum number of outgoing messages we're willing to hold, on
+  * a queue per connection, waiting for TCP buffer space to become available in
+  * the kernel.  Rather than exceeding this limit, we simply discard additional
+  * messages (since this is always allowed by the replication protocol).
+  *    As a special dispensation, if a message is destined for a specific remote
+  * site (i.e., it's not a broadcast), then we first try blocking the sending
+  * thread, waiting for space to become available (though we only wait a limited
+  * time).  This is so as to be able to handle the immediate flood of (a
+  * potentially large number of) outgoing messages that replication generates, in
+  * a tight loop, when handling PAGE_REQ, LOG_REQ and ALL_REQ requests.
+  */
+ #define	OUT_QUEUE_LIMIT	10
+ 
+ /*
   * The system value is available from sysconf(_SC_HOST_NAME_MAX).
   * Historically, the maximum host name was 256.
   */
***************
*** 47,52 ****
--- 61,71 ----
  #define	MAX_SITE_LOC_STRING (MAXHOSTNAMELEN+20)
  typedef char SITE_STRING_BUFFER[MAX_SITE_LOC_STRING+1];
  
+ /* Default timeout values, in seconds. */
+ #define	DB_REPMGR_DEFAULT_ACK_TIMEOUT		(1 * US_PER_SEC)
+ #define	DB_REPMGR_DEFAULT_CONNECTION_RETRY	(30 * US_PER_SEC)
+ #define	DB_REPMGR_DEFAULT_ELECTION_RETRY	(10 * US_PER_SEC)
+ 
  struct __repmgr_connection;
      typedef struct __repmgr_connection REPMGR_CONNECTION;
  struct __repmgr_queue; typedef struct __repmgr_queue REPMGR_QUEUE;
***************
*** 171,178 ****
  #ifdef DB_WIN32
  	WSAEVENT event_object;
  #endif
! #define	CONN_CONNECTING	0x01	/* nonblocking connect in progress */
! #define	CONN_DEFUNCT	0x02	/* socket close pending */
  	u_int32_t flags;
  
  	/*
--- 190,198 ----
  #ifdef DB_WIN32
  	WSAEVENT event_object;
  #endif
! #define	CONN_CONGESTED	0x01	/* msg thread wait has exceeded timeout */
! #define	CONN_CONNECTING	0x02	/* nonblocking connect in progress */
! #define	CONN_DEFUNCT	0x04	/* socket close pending */
  	u_int32_t flags;
  
  	/*
***************
*** 180,189 ****
  	 * send() function's thread.  But if TCP doesn't have enough network
  	 * buffer space for us when we first try it, we instead allocate some
  	 * memory, and copy the message, and then send it as space becomes
! 	 * available in our main select() thread.
  	 */
  	OUT_Q_HEADER outbound_queue;
  	int out_queue_length;
  
  	/*
  	 * Input: while we're reading a message, we keep track of what phase
--- 200,215 ----
  	 * send() function's thread.  But if TCP doesn't have enough network
  	 * buffer space for us when we first try it, we instead allocate some
  	 * memory, and copy the message, and then send it as space becomes
! 	 * available in our main select() thread.  In some cases, if the queue
! 	 * gets too long we wait until it's drained, and then append to it.
! 	 * This condition variable's associated mutex is the normal per-repmgr
! 	 * db_rep->mutex, because that mutex is always held anyway whenever the
! 	 * output queue is consulted.
  	 */
  	OUT_Q_HEADER outbound_queue;
  	int out_queue_length;
+ 	cond_var_t drained;
+ 	int blockers;		/* ref count of msg threads waiting on us */
  
  	/*
  	 * Input: while we're reading a message, we keep track of what phase
*** dbinc_auto/int_def.in	2007-10-31 10:23:52.000000000 -0700
--- dbinc_auto/int_def.in	2007-10-31 10:23:52.000000000 -0700
***************
*** 1420,1425 ****
--- 1420,1428 ----
  #define	__repmgr_wake_waiting_senders __repmgr_wake_waiting_senders@DB_VERSION_UNIQUE_NAME@
  #define	__repmgr_await_ack __repmgr_await_ack@DB_VERSION_UNIQUE_NAME@
  #define	__repmgr_compute_wait_deadline __repmgr_compute_wait_deadline@DB_VERSION_UNIQUE_NAME@
+ #define	__repmgr_await_drain __repmgr_await_drain@DB_VERSION_UNIQUE_NAME@
+ #define	__repmgr_alloc_cond __repmgr_alloc_cond@DB_VERSION_UNIQUE_NAME@
+ #define	__repmgr_free_cond __repmgr_free_cond@DB_VERSION_UNIQUE_NAME@
  #define	__repmgr_init_sync __repmgr_init_sync@DB_VERSION_UNIQUE_NAME@
  #define	__repmgr_close_sync __repmgr_close_sync@DB_VERSION_UNIQUE_NAME@
  #define	__repmgr_net_init __repmgr_net_init@DB_VERSION_UNIQUE_NAME@
*** dbinc_auto/repmgr_ext.h	2007-10-31 10:23:52.000000000 -0700
--- dbinc_auto/repmgr_ext.h	2007-10-31 10:23:52.000000000 -0700
***************
*** 21,30 ****
  int __repmgr_handle_event __P((DB_ENV *, u_int32_t, void *));
  void __repmgr_stash_generation __P((DB_ENV *));
  int __repmgr_send __P((DB_ENV *, const DBT *, const DBT *, const DB_LSN *, int, u_int32_t));
! int __repmgr_send_one __P((DB_ENV *, REPMGR_CONNECTION *, u_int, const DBT *, const DBT *));
  int __repmgr_is_permanent __P((DB_ENV *, const DB_LSN *));
! int __repmgr_bust_connection __P((DB_ENV *, REPMGR_CONNECTION *, int));
! void __repmgr_cleanup_connection __P((DB_ENV *, REPMGR_CONNECTION *));
  int __repmgr_find_site __P((DB_ENV *, const char *, u_int));
  int __repmgr_pack_netaddr __P((DB_ENV *, const char *, u_int, ADDRINFO *, repmgr_netaddr_t *));
  int __repmgr_getaddr __P((DB_ENV *, const char *, u_int, int, ADDRINFO **));
--- 21,30 ----
  int __repmgr_handle_event __P((DB_ENV *, u_int32_t, void *));
  void __repmgr_stash_generation __P((DB_ENV *));
  int __repmgr_send __P((DB_ENV *, const DBT *, const DBT *, const DB_LSN *, int, u_int32_t));
! int __repmgr_send_one __P((DB_ENV *, REPMGR_CONNECTION *, u_int, const DBT *, const DBT *, int));
  int __repmgr_is_permanent __P((DB_ENV *, const DB_LSN *));
! int __repmgr_bust_connection __P((DB_ENV *, REPMGR_CONNECTION *));
! int __repmgr_cleanup_connection __P((DB_ENV *, REPMGR_CONNECTION *));
  int __repmgr_find_site __P((DB_ENV *, const char *, u_int));
  int __repmgr_pack_netaddr __P((DB_ENV *, const char *, u_int, ADDRINFO *, repmgr_netaddr_t *));
  int __repmgr_getaddr __P((DB_ENV *, const char *, u_int, int, ADDRINFO **));
***************
*** 39,44 ****
--- 39,47 ----
  int __repmgr_wake_waiting_senders __P((DB_ENV *));
  int __repmgr_await_ack __P((DB_ENV *, const DB_LSN *));
  void __repmgr_compute_wait_deadline __P((DB_ENV*, struct timespec *, db_timeout_t));
+ int __repmgr_await_drain __P((DB_ENV *, REPMGR_CONNECTION *, db_timeout_t));
+ int __repmgr_alloc_cond __P((cond_var_t *));
+ int __repmgr_free_cond __P((cond_var_t *));
  int __repmgr_init_sync __P((DB_ENV *, DB_REP *));
  int __repmgr_close_sync __P((DB_ENV *));
  int __repmgr_net_init __P((DB_ENV *, DB_REP *));
*** repmgr/repmgr_method.c	2007-10-31 10:23:52.000000000 -0700
--- repmgr/repmgr_method.c	2007-10-31 10:23:53.000000000 -0700
***************
*** 196,204 ****
  	int ret;
  
  	/* Set some default values. */
! 	db_rep->ack_timeout = 1 * US_PER_SEC;			/*  1 second */
! 	db_rep->connection_retry_wait = 30 * US_PER_SEC;	/* 30 seconds */
! 	db_rep->election_retry_wait = 10 * US_PER_SEC;		/* 10 seconds */
  	db_rep->config_nsites = 0;
  	db_rep->peer = DB_EID_INVALID;
  	db_rep->perm_policy = DB_REPMGR_ACKS_QUORUM;
--- 196,204 ----
  	int ret;
  
  	/* Set some default values. */
! 	db_rep->ack_timeout = DB_REPMGR_DEFAULT_ACK_TIMEOUT;
! 	db_rep->connection_retry_wait = DB_REPMGR_DEFAULT_CONNECTION_RETRY;
! 	db_rep->election_retry_wait = DB_REPMGR_DEFAULT_ELECTION_RETRY;
  	db_rep->config_nsites = 0;
  	db_rep->peer = DB_EID_INVALID;
  	db_rep->perm_policy = DB_REPMGR_ACKS_QUORUM;
***************
*** 238,243 ****
--- 238,244 ----
  	DB_ENV *dbenv;
  {
  	DB_REP *db_rep;
+ 	REPMGR_CONNECTION *conn;
  	int ret;
  
  	db_rep = dbenv->rep_handle;
***************
*** 254,259 ****
--- 255,266 ----
  
  	if ((ret = __repmgr_signal(&db_rep->queue_nonempty)) != 0)
  		goto unlock;
+ 
+ 	TAILQ_FOREACH(conn, &db_rep->connections, entries) {
+ 		if (conn->blockers > 0 &&
+ 		    ((ret = __repmgr_signal(&conn->drained)) != 0))
+ 			goto unlock;
+ 	}
  	UNLOCK_MUTEX(db_rep->mutex);
  
  	return (__repmgr_wake_main_thread(dbenv));
*** repmgr/repmgr_msg.c	2007-10-31 10:23:52.000000000 -0700
--- repmgr/repmgr_msg.c	2007-10-31 10:23:53.000000000 -0700
***************
*** 183,192 ****
  
  /*
   * Acknowledges a message.
-  *
-  * !!!
-  * Note that this cannot be called from the select() thread, in case we call
-  * __repmgr_bust_connection(..., FALSE).
   */
  static int
  ack_message(dbenv, generation, lsn)
--- 183,188 ----
***************
*** 227,235 ****
  		rec2.size = 0;
  
  		conn = site->ref.conn;
  		if ((ret = __repmgr_send_one(dbenv, conn, REPMGR_ACK,
! 		    &control2, &rec2)) == DB_REP_UNAVAIL)
! 			ret = __repmgr_bust_connection(dbenv, conn, FALSE);
  	}
  
  	UNLOCK_MUTEX(db_rep->mutex);
--- 223,236 ----
  		rec2.size = 0;
  
  		conn = site->ref.conn;
+ 		/*
+ 		 * It's hard to imagine anyone would care about a lost ack if
+ 		 * the path to the master is so congested as to need blocking;
+ 		 * so pass "blockable" argument as FALSE.
+ 		 */
  		if ((ret = __repmgr_send_one(dbenv, conn, REPMGR_ACK,
! 		    &control2, &rec2, FALSE)) == DB_REP_UNAVAIL)
! 			ret = __repmgr_bust_connection(dbenv, conn);
  	}
  
  	UNLOCK_MUTEX(db_rep->mutex);
*** repmgr/repmgr_net.c	2007-10-31 10:23:52.000000000 -0700
--- repmgr/repmgr_net.c	2007-10-31 10:23:53.000000000 -0700
***************
*** 63,69 ****
  static void setup_sending_msg
      __P((struct sending_msg *, u_int, const DBT *, const DBT *));
  static int __repmgr_send_internal
!     __P((DB_ENV *, REPMGR_CONNECTION *, struct sending_msg *));
  static int enqueue_msg
      __P((DB_ENV *, REPMGR_CONNECTION *, struct sending_msg *, size_t));
  static int flatten __P((DB_ENV *, struct sending_msg *));
--- 63,69 ----
  static void setup_sending_msg
      __P((struct sending_msg *, u_int, const DBT *, const DBT *));
  static int __repmgr_send_internal
!     __P((DB_ENV *, REPMGR_CONNECTION *, struct sending_msg *, int));
  static int enqueue_msg
      __P((DB_ENV *, REPMGR_CONNECTION *, struct sending_msg *, size_t));
  static int flatten __P((DB_ENV *, struct sending_msg *));
***************
*** 73,85 ****
   * __repmgr_send --
   *	The send function for DB_ENV->rep_set_transport.
   *
-  * !!!
-  * This is only ever called as the replication transport call-back, which means
-  * it's either on one of our message processing threads or an application
-  * thread.  It mustn't be called from the select() thread, because we might call
-  * __repmgr_bust_connection(..., FALSE) here, and that's not allowed in the
-  * select() thread.
-  *
   * PUBLIC: int __repmgr_send __P((DB_ENV *, const DBT *, const DBT *,
   * PUBLIC:     const DB_LSN *, int, u_int32_t));
   */
--- 73,78 ----
***************
*** 126,134 ****
  		}
  
  		conn = site->ref.conn;
  		if ((ret = __repmgr_send_one(dbenv, conn, REPMGR_REP_MESSAGE,
! 		    control, rec)) == DB_REP_UNAVAIL &&
! 		    (t_ret = __repmgr_bust_connection(dbenv, conn, FALSE)) != 0)
  			ret = t_ret;
  		if (ret != 0)
  			goto out;
--- 119,128 ----
  		}
  
  		conn = site->ref.conn;
+ 		/* Pass the "blockable" argument as TRUE. */
  		if ((ret = __repmgr_send_one(dbenv, conn, REPMGR_REP_MESSAGE,
! 		    control, rec, TRUE)) == DB_REP_UNAVAIL &&
! 		    (t_ret = __repmgr_bust_connection(dbenv, conn)) != 0)
  			ret = t_ret;
  		if (ret != 0)
  			goto out;
***************
*** 222,228 ****
  	if (site->state != SITE_CONNECTED)
  		return (NULL);
  
! 	if (F_ISSET(site->ref.conn, CONN_CONNECTING))
  		return (NULL);
  	return (site);
  }
--- 216,222 ----
  	if (site->state != SITE_CONNECTED)
  		return (NULL);
  
! 	if (F_ISSET(site->ref.conn, CONN_CONNECTING|CONN_DEFUNCT))
  		return (NULL);
  	return (site);
  }
***************
*** 235,244 ****
   *
   * !!!
   * Caller must hold dbenv->mutex.
-  *
-  * !!!
-  * Note that this cannot be called from the select() thread, in case we call
-  * __repmgr_bust_connection(..., FALSE).
   */
  static int
  __repmgr_send_broadcast(dbenv, control, rec, nsitesp, npeersp)
--- 229,234 ----
***************
*** 268,281 ****
  		    !IS_VALID_EID(conn->eid))
  			continue;
  
! 		if ((ret = __repmgr_send_internal(dbenv, conn, &msg)) == 0) {
  			site = SITE_FROM_EID(conn->eid);
  			nsites++;
  			if (site->priority > 0)
  				npeers++;
  		} else if (ret == DB_REP_UNAVAIL) {
! 			if ((ret = __repmgr_bust_connection(
! 			     dbenv, conn, FALSE)) != 0)
  				return (ret);
  		} else
  			return (ret);
--- 258,277 ----
  		    !IS_VALID_EID(conn->eid))
  			continue;
  
! 		/*
! 		 * Broadcast messages are either application threads committing
! 		 * transactions, or replication status message that we can
! 		 * afford to lose.  So don't allow blocking for them (pass
! 		 * "blockable" argument as FALSE).
! 		 */
! 		if ((ret = __repmgr_send_internal(dbenv,
! 		    conn, &msg, FALSE)) == 0) {
  			site = SITE_FROM_EID(conn->eid);
  			nsites++;
  			if (site->priority > 0)
  				npeers++;
  		} else if (ret == DB_REP_UNAVAIL) {
! 			if ((ret = __repmgr_bust_connection(dbenv, conn)) != 0)
  				return (ret);
  		} else
  			return (ret);
***************
*** 301,339 ****
   * intersperse writes that are part of two single messages.
   *
   * PUBLIC: int __repmgr_send_one __P((DB_ENV *, REPMGR_CONNECTION *,
!  * PUBLIC:    u_int, const DBT *, const DBT *));
   */
  int
! __repmgr_send_one(dbenv, conn, msg_type, control, rec)
  	DB_ENV *dbenv;
  	REPMGR_CONNECTION *conn;
  	u_int msg_type;
  	const DBT *control, *rec;
  {
  	struct sending_msg msg;
  
  	setup_sending_msg(&msg, msg_type, control, rec);
! 	return (__repmgr_send_internal(dbenv, conn, &msg));
  }
  
  /*
   * Attempts a "best effort" to send a message on the given site.  If there is an
!  * excessive backlog of message already queued on the connection, we simply drop
!  * this message, and still return 0 even in this case.
   */
  static int
! __repmgr_send_internal(dbenv, conn, msg)
  	DB_ENV *dbenv;
  	REPMGR_CONNECTION *conn;
  	struct sending_msg *msg;
  {
! #define	OUT_QUEUE_LIMIT 10	/* arbitrary, for now */
  	REPMGR_IOVECS iovecs;
  	SITE_STRING_BUFFER buffer;
  	int ret;
  	size_t nw;
  	size_t total_written;
  
  	DB_ASSERT(dbenv, !F_ISSET(conn, CONN_CONNECTING));
  	if (!STAILQ_EMPTY(&conn->outbound_queue)) {
  		/*
--- 297,355 ----
   * intersperse writes that are part of two single messages.
   *
   * PUBLIC: int __repmgr_send_one __P((DB_ENV *, REPMGR_CONNECTION *,
!  * PUBLIC:    u_int, const DBT *, const DBT *, int));
   */
  int
! __repmgr_send_one(dbenv, conn, msg_type, control, rec, blockable)
  	DB_ENV *dbenv;
  	REPMGR_CONNECTION *conn;
  	u_int msg_type;
  	const DBT *control, *rec;
+ 	int blockable;
  {
  	struct sending_msg msg;
  
  	setup_sending_msg(&msg, msg_type, control, rec);
! 	return (__repmgr_send_internal(dbenv, conn, &msg, blockable));
  }
  
  /*
   * Attempts a "best effort" to send a message on the given site.  If there is an
!  * excessive backlog of message already queued on the connection, what shall we
!  * do?  If the caller doesn't mind blocking, we'll wait (a limited amount of
!  * time) for the queue to drain.  Otherwise we'll simply drop the message.  This
!  * is always allowed by the replication protocol.  But in the case of a
!  * multi-message response to a request like PAGE_REQ, LOG_REQ or ALL_REQ we
!  * almost always get a flood of messages that instantly fills our queue, so
!  * blocking improves performance (by avoiding the need for the client to
!  * re-request).
!  *
!  * How long shall we wait?  We could of course create a new timeout
!  * configuration type, so that the application could set it directly.  But that
!  * would start to overwhelm the user with too many choices to think about.  We
!  * already have an ACK timeout, which is the user's estimate of how long it
!  * should take to send a message to the client, have it be processed, and return
!  * a message back to us.  We multiply that by the queue size, because that's how
!  * many messages have to be swallowed up by the client before we're able to
!  * start sending again (at least to a rough approximation).
   */
  static int
! __repmgr_send_internal(dbenv, conn, msg, blockable)
  	DB_ENV *dbenv;
  	REPMGR_CONNECTION *conn;
  	struct sending_msg *msg;
+ 	int blockable;
  {
! 	DB_REP *db_rep;
  	REPMGR_IOVECS iovecs;
  	SITE_STRING_BUFFER buffer;
+ 	db_timeout_t drain_to;
  	int ret;
  	size_t nw;
  	size_t total_written;
  
+ 	db_rep = dbenv->rep_handle;
+ 
  	DB_ASSERT(dbenv, !F_ISSET(conn, CONN_CONNECTING));
  	if (!STAILQ_EMPTY(&conn->outbound_queue)) {
  		/*
***************
*** 344,358 ****
  		RPRINT(dbenv, (dbenv, "msg to %s to be queued",
  		    __repmgr_format_eid_loc(dbenv->rep_handle,
  		    conn->eid, buffer)));
  		if (conn->out_queue_length < OUT_QUEUE_LIMIT)
  			return (enqueue_msg(dbenv, conn, msg, 0));
  		else {
  			RPRINT(dbenv, (dbenv, "queue limit exceeded"));
  			STAT(dbenv->rep_handle->
  			    region->mstat.st_msgs_dropped++);
! 			return (0);
  		}
  	}
  
  	/*
  	 * Send as much data to the site as we can, without blocking.  Keep
--- 360,393 ----
  		RPRINT(dbenv, (dbenv, "msg to %s to be queued",
  		    __repmgr_format_eid_loc(dbenv->rep_handle,
  		    conn->eid, buffer)));
+ 		if (conn->out_queue_length >= OUT_QUEUE_LIMIT &&
+ 		    blockable && !F_ISSET(conn, CONN_CONGESTED)) {
+ 			RPRINT(dbenv, (dbenv,
+ 			    "block msg thread, await queue space"));
+ 
+ 			if ((drain_to = db_rep->ack_timeout) == 0)
+ 				drain_to = DB_REPMGR_DEFAULT_ACK_TIMEOUT;
+ 			conn->blockers++;
+ 			ret = __repmgr_await_drain(dbenv,
+ 			    conn, drain_to * OUT_QUEUE_LIMIT);
+ 			conn->blockers--;
+ 			if (db_rep->finished)
+ 				return (DB_TIMEOUT);
+ 			if (ret != 0)
+ 				return (ret);
+ 			if (STAILQ_EMPTY(&conn->outbound_queue))
+ 				goto empty;
+ 		}
  		if (conn->out_queue_length < OUT_QUEUE_LIMIT)
  			return (enqueue_msg(dbenv, conn, msg, 0));
  		else {
  			RPRINT(dbenv, (dbenv, "queue limit exceeded"));
  			STAT(dbenv->rep_handle->
  			    region->mstat.st_msgs_dropped++);
! 			return (blockable ? DB_TIMEOUT : 0);
  		}
  	}
+ empty:
  
  	/*
  	 * Send as much data to the site as we can, without blocking.  Keep
***************
*** 498,521 ****
  
  /*
   * Abandons a connection, to recover from an error.  Upon entry the conn struct
!  * must be on the connections list.
!  *
!  * If the 'do_close' flag is true, we do the whole job; the clean-up includes
!  * removing the struct from the list and freeing all its memory, so upon return
!  * the caller must not refer to it any further.  Otherwise, we merely mark the
!  * connection for clean-up later by the main thread.
   *
   * PUBLIC: int __repmgr_bust_connection __P((DB_ENV *,
!  * PUBLIC:     REPMGR_CONNECTION *, int));
   *
   * !!!
   * Caller holds mutex.
   */
  int
! __repmgr_bust_connection(dbenv, conn, do_close)
  	DB_ENV *dbenv;
  	REPMGR_CONNECTION *conn;
- 	int do_close;
  {
  	DB_REP *db_rep;
  	int connecting, ret, eid;
--- 533,553 ----
  
  /*
   * Abandons a connection, to recover from an error.  Upon entry the conn struct
!  * must be on the connections list.  For now, just mark it as unusable; it will
!  * be fully cleaned up in the top-level select thread, as soon as possible.
   *
   * PUBLIC: int __repmgr_bust_connection __P((DB_ENV *,
!  * PUBLIC:     REPMGR_CONNECTION *));
   *
   * !!!
   * Caller holds mutex.
+  *
+  * Must be idempotent
   */
  int
! __repmgr_bust_connection(dbenv, conn)
  	DB_ENV *dbenv;
  	REPMGR_CONNECTION *conn;
  {
  	DB_REP *db_rep;
  	int connecting, ret, eid;
***************
*** 526,537 ****
  	DB_ASSERT(dbenv, !TAILQ_EMPTY(&db_rep->connections));
  	eid = conn->eid;
  	connecting = F_ISSET(conn, CONN_CONNECTING);
! 	if (do_close)
! 		__repmgr_cleanup_connection(dbenv, conn);
! 	else {
! 		F_SET(conn, CONN_DEFUNCT);
! 		conn->eid = -1;
! 	}
  
  	/*
  	 * When we first accepted the incoming connection, we set conn->eid to
--- 558,566 ----
  	DB_ASSERT(dbenv, !TAILQ_EMPTY(&db_rep->connections));
  	eid = conn->eid;
  	connecting = F_ISSET(conn, CONN_CONNECTING);
! 
! 	F_SET(conn, CONN_DEFUNCT);
! 	conn->eid = -1;
  
  	/*
  	 * When we first accepted the incoming connection, we set conn->eid to
***************
*** 557,563 ****
  			    dbenv, ELECT_FAILURE_ELECTION)) != 0)
  				return (ret);
  		}
! 	} else if (!do_close) {
  		/*
  		 * One way or another, make sure the main thread is poked, so
  		 * that we do the deferred clean-up.
--- 586,592 ----
  			    dbenv, ELECT_FAILURE_ELECTION)) != 0)
  				return (ret);
  		}
! 	} else {
  		/*
  		 * One way or another, make sure the main thread is poked, so
  		 * that we do the deferred clean-up.
***************
*** 568,577 ****
  }
  
  /*
!  * PUBLIC: void __repmgr_cleanup_connection
   * PUBLIC:    __P((DB_ENV *, REPMGR_CONNECTION *));
   */
! void
  __repmgr_cleanup_connection(dbenv, conn)
  	DB_ENV *dbenv;
  	REPMGR_CONNECTION *conn;
--- 597,610 ----
  }
  
  /*
!  * PUBLIC: int __repmgr_cleanup_connection
   * PUBLIC:    __P((DB_ENV *, REPMGR_CONNECTION *));
+  *
+  * !!!
+  * Idempotent.  This can be called repeatedly as blocking message threads (of
+  * which there could be multiples) wake up in case of error on the connection.
   */
! int
  __repmgr_cleanup_connection(dbenv, conn)
  	DB_ENV *dbenv;
  	REPMGR_CONNECTION *conn;
***************
*** 580,596 ****
  	QUEUED_OUTPUT *out;
  	REPMGR_FLAT *msg;
  	DBT *dbt;
  
  	db_rep = dbenv->rep_handle;
  
! 	TAILQ_REMOVE(&db_rep->connections, conn, entries);
  	if (conn->fd != INVALID_SOCKET) {
! 		(void)closesocket(conn->fd);
  #ifdef DB_WIN32
! 		(void)WSACloseEvent(conn->event_object);
  #endif
  	}
  
  	/*
  	 * Deallocate any input and output buffers we may have.
  	 */
--- 613,643 ----
  	QUEUED_OUTPUT *out;
  	REPMGR_FLAT *msg;
  	DBT *dbt;
+ 	int ret;
  
  	db_rep = dbenv->rep_handle;
  
! 	DB_ASSERT(dbenv, F_ISSET(conn, CONN_DEFUNCT) || db_rep->finished);
! 	
  	if (conn->fd != INVALID_SOCKET) {
! 		ret = closesocket(conn->fd);
! 		conn->fd = INVALID_SOCKET;
! 		if (ret == SOCKET_ERROR) {
! 			ret = net_errno;
! 			__db_err(dbenv, ret, "closing socket");
! 		}
  #ifdef DB_WIN32
! 		if (!WSACloseEvent(conn->event_object) && ret != 0)
! 			ret = net_errno;
  #endif
+ 		if (ret != 0)
+ 			return (ret);
  	}
  
+ 	if (conn->blockers > 0)
+ 		return (__repmgr_signal(&conn->drained));
+ 
+ 	TAILQ_REMOVE(&db_rep->connections, conn, entries);
  	/*
  	 * Deallocate any input and output buffers we may have.
  	 */
***************
*** 614,620 ****
--- 661,669 ----
  		__os_free(dbenv, out);
  	}
  
+ 	ret = __repmgr_free_cond(&conn->drained);
  	__os_free(dbenv, conn);
+ 	return (ret);
  }
  
  static int
***************
*** 1063,1069 ****
  
  	while (!TAILQ_EMPTY(&db_rep->connections)) {
  		conn = TAILQ_FIRST(&db_rep->connections);
! 		__repmgr_cleanup_connection(dbenv, conn);
  	}
  
  	for (i = 0; i < db_rep->site_cnt; i++) {
--- 1112,1118 ----
  
  	while (!TAILQ_EMPTY(&db_rep->connections)) {
  		conn = TAILQ_FIRST(&db_rep->connections);
! 		(void)__repmgr_cleanup_connection(dbenv, conn);
  	}
  
  	for (i = 0; i < db_rep->site_cnt; i++) {
*** repmgr/repmgr_posix.c	2007-10-31 10:23:52.000000000 -0700
--- repmgr/repmgr_posix.c	2007-10-31 10:23:53.000000000 -0700
***************
*** 21,26 ****
--- 21,28 ----
  size_t __repmgr_guesstimated_max = (128 * 1024);
  #endif
  
+ static int __repmgr_conn_work __P((DB_ENV *,
+     REPMGR_CONNECTION *, fd_set *, fd_set *, int));
  static int finish_connecting __P((DB_ENV *, REPMGR_CONNECTION *));
  
  /*
***************
*** 189,194 ****
--- 191,284 ----
  }
  
  /*
+  * PUBLIC: int __repmgr_await_drain __P((DB_ENV *,
+  * PUBLIC:    REPMGR_CONNECTION *, db_timeout_t));
+  *
+  * Waits for space to become available on the connection's output queue.
+  * Various ways we can exit:
+  *
+  * 1. queue becomes non-full
+  * 2. exceed time limit
+  * 3. connection becomes defunct (due to error in another thread)
+  * 4. repmgr is shutting down
+  * 5. any unexpected system resource failure
+  *
+  * In cases #3 and #5 we return an error code.  Caller is responsible for
+  * distinguishing the remaining cases if desired.
+  * 
+  * !!!
+  * Caller must hold repmgr->mutex.
+  */
+ int
+ __repmgr_await_drain(dbenv, conn, timeout)
+ 	DB_ENV *dbenv;
+ 	REPMGR_CONNECTION *conn;
+ 	db_timeout_t timeout;
+ {
+ 	DB_REP *db_rep;
+ 	struct timespec deadline;
+ 	int ret;
+ 
+ 	db_rep = dbenv->rep_handle;
+ 
+ 	__repmgr_compute_wait_deadline(dbenv, &deadline, timeout);
+ 
+ 	ret = 0;
+ 	while (conn->out_queue_length >= OUT_QUEUE_LIMIT) {
+ 		ret = pthread_cond_timedwait(&conn->drained,
+ 		    &db_rep->mutex, &deadline);
+ 		switch (ret) {
+ 		case 0:
+ 			if (db_rep->finished)
+ 				goto out; /* #4. */
+ 			/*
+ 			 * Another thread could have stumbled into an error on
+ 			 * the socket while we were waiting.
+ 			 */
+ 			if (F_ISSET(conn, CONN_DEFUNCT)) {
+ 				ret = DB_REP_UNAVAIL; /* #3. */
+ 				goto out;
+ 			}
+ 			break;
+ 		case ETIMEDOUT:
+ 			F_SET(conn, CONN_CONGESTED);
+ 			ret = 0;
+ 			goto out; /* #2. */
+ 		default:
+ 			goto out; /* #5. */
+ 		}
+ 	}
+ 	/* #1. */
+ 
+ out:
+ 	return (ret);
+ }
+ 
+ /*
+  * PUBLIC: int __repmgr_alloc_cond __P((cond_var_t *));
+  *
+  * Initialize a condition variable (in allocated space).
+  */
+ int
+ __repmgr_alloc_cond(c)
+ 	cond_var_t *c;
+ {
+ 	return (pthread_cond_init(c, NULL));
+ }
+ 
+ /*
+  * PUBLIC: int __repmgr_free_cond __P((cond_var_t *));
+  *
+  * Clean up a previously initialized condition variable.
+  */
+ int
+ __repmgr_free_cond(c)
+ 	cond_var_t *c;
+ {
+ 	return (pthread_cond_destroy(c));
+ }
+ 
+ /*
   * PUBLIC: int __repmgr_init_sync __P((DB_ENV *, DB_REP *));
   *
   * Allocate/initialize all data necessary for thread synchronization.  This
***************
*** 443,449 ****
  	REPMGR_RETRY *retry;
  	db_timespec timeout;
  	fd_set reads, writes;
! 	int ret, flow_control, maxfd, nready;
  	u_int8_t buf[10];	/* arbitrary size */
  
  	flow_control = FALSE;
--- 533,539 ----
  	REPMGR_RETRY *retry;
  	db_timespec timeout;
  	fd_set reads, writes;
! 	int ret, flow_control, maxfd;
  	u_int8_t buf[10];	/* arbitrary size */
  
  	flow_control = FALSE;
***************
*** 477,482 ****
--- 567,575 ----
  		 * each one.
  		 */
  		TAILQ_FOREACH(conn, &db_rep->connections, entries) {
+ 			if (F_ISSET(conn, CONN_DEFUNCT))
+ 				continue;
+ 			
  			if (F_ISSET(conn, CONN_CONNECTING)) {
  				FD_SET((u_int)conn->fd, &reads);
  				FD_SET((u_int)conn->fd, &writes);
***************
*** 533,616 ****
  				return (ret);
  			}
  		}
- 		nready = ret;
- 
  		LOCK_MUTEX(db_rep->mutex);
  
- 		/*
- 		 * The first priority thing we must do is to clean up any
- 		 * pending defunct connections.  Otherwise, if they have any
- 		 * lingering pending input, we get very confused if we try to
- 		 * process it.
- 		 *
- 		 * The TAILQ_FOREACH macro would be suitable here, except that
- 		 * it doesn't allow unlinking the current element, which is
- 		 * needed for cleanup_connection.
- 		 */
- 		for (conn = TAILQ_FIRST(&db_rep->connections);
- 		     conn != NULL;
- 		     conn = next) {
- 			next = TAILQ_NEXT(conn, entries);
- 			if (F_ISSET(conn, CONN_DEFUNCT))
- 				__repmgr_cleanup_connection(dbenv, conn);
- 		}
- 
  		if ((ret = __repmgr_retry_connections(dbenv)) != 0)
  			goto out;
- 		if (nready == 0)
- 			continue;
  
  		/*
! 		 * Traverse the linked list.  (Again, like TAILQ_FOREACH, except
! 		 * that we need the ability to unlink an element along the way.)
  		 */
  		for (conn = TAILQ_FIRST(&db_rep->connections);
  		     conn != NULL;
  		     conn = next) {
  			next = TAILQ_NEXT(conn, entries);
! 			if (F_ISSET(conn, CONN_CONNECTING)) {
! 				if (FD_ISSET((u_int)conn->fd, &reads) ||
! 				    FD_ISSET((u_int)conn->fd, &writes)) {
! 					if ((ret = finish_connecting(dbenv,
! 					    conn)) == DB_REP_UNAVAIL) {
! 						if ((ret =
! 						    __repmgr_bust_connection(
! 						    dbenv, conn, TRUE)) != 0)
! 							goto out;
! 					} else if (ret != 0)
! 						goto out;
! 				}
! 				continue;
! 			}
! 
! 			/*
! 			 * Here, the site is connected, and the FD_SET's are
! 			 * valid.
! 			 */
! 			if (FD_ISSET((u_int)conn->fd, &writes)) {
! 				if ((ret = __repmgr_write_some(
! 				    dbenv, conn)) == DB_REP_UNAVAIL) {
! 					if ((ret =
! 					    __repmgr_bust_connection(dbenv,
! 					    conn, TRUE)) != 0)
! 						goto out;
! 					continue;
! 				} else if (ret != 0)
! 					goto out;
! 			}
! 
! 			if (!flow_control &&
! 			    FD_ISSET((u_int)conn->fd, &reads)) {
! 				if ((ret = __repmgr_read_from_site(dbenv, conn))
! 				    == DB_REP_UNAVAIL) {
! 					if ((ret =
! 					    __repmgr_bust_connection(dbenv,
! 					    conn, TRUE)) != 0)
! 						goto out;
! 					continue;
! 				} else if (ret != 0)
! 					goto out;
! 			}
  		}
  
  		/*
--- 626,650 ----
  				return (ret);
  			}
  		}
  		LOCK_MUTEX(db_rep->mutex);
  
  		if ((ret = __repmgr_retry_connections(dbenv)) != 0)
  			goto out;
  
  		/*
! 		 * Examine each connection, to see what work needs to be done.
! 		 * 
! 		 * The TAILQ_FOREACH macro would be suitable here, except that
! 		 * it doesn't allow unlinking the current element, which is
! 		 * needed for cleanup_connection.
  		 */
  		for (conn = TAILQ_FIRST(&db_rep->connections);
  		     conn != NULL;
  		     conn = next) {
  			next = TAILQ_NEXT(conn, entries);
! 			if ((ret = __repmgr_conn_work(dbenv,
! 			    conn, &reads, &writes, flow_control)) != 0)
! 				goto out;
  		}
  
  		/*
***************
*** 637,642 ****
--- 671,719 ----
  }
  
  static int
+ __repmgr_conn_work(dbenv, conn, reads, writes, flow_control)
+ 	DB_ENV *dbenv;
+ 	REPMGR_CONNECTION *conn;
+ 	fd_set *reads, *writes;
+ 	int flow_control;
+ {
+ 	int ret;
+ 	u_int fd;
+ 
+ 	if (F_ISSET(conn, CONN_DEFUNCT)) {
+ 		/*
+ 		 * Deferred clean-up, from an error that happened in another
+ 		 * thread, while we were sleeping in select().
+ 		*/
+ 		return (__repmgr_cleanup_connection(dbenv, conn));
+ 	}
+ 
+ 	ret = 0;
+ 	fd = (u_int)conn->fd;
+ 	
+ 	if (F_ISSET(conn, CONN_CONNECTING)) {
+ 		if (FD_ISSET(fd, reads) || FD_ISSET(fd, writes))
+ 			ret = finish_connecting(dbenv, conn);
+ 	} else {
+ 		/*
+ 		 * Here, the site is connected, and the FD_SET's are valid.
+ 		 */
+ 		if (FD_ISSET(fd, writes))
+ 			ret = __repmgr_write_some(dbenv, conn);
+ 				
+ 		if (ret == 0 && !flow_control && FD_ISSET(fd, reads))
+ 			ret = __repmgr_read_from_site(dbenv, conn);
+ 	}
+ 
+ 	if (ret == DB_REP_UNAVAIL) {
+ 		if ((ret = __repmgr_bust_connection(dbenv, conn)) != 0)
+ 			return (ret);
+ 		ret = __repmgr_cleanup_connection(dbenv, conn);
+ 	}
+ 	return (ret);
+ }
+ 
+ static int
  finish_connecting(dbenv, conn)
  	DB_ENV *dbenv;
  	REPMGR_CONNECTION *conn;
***************
*** 657,662 ****
--- 734,740 ----
  		goto err_rpt;
  	}
  
+ 	DB_ASSERT(dbenv, F_ISSET(conn, CONN_CONNECTING));
  	F_CLR(conn, CONN_CONNECTING);
  	return (__repmgr_send_handshake(dbenv, conn));
  
***************
*** 671,690 ****
  	    "connecting to %s", __repmgr_format_site_loc(site, buffer));
  
  	/* If we've exhausted the list of possible addresses, give up. */
! 	if (ADDR_LIST_NEXT(&site->net_addr) == NULL)
  		return (DB_REP_UNAVAIL);
  
  	/*
  	 * This is just like a little mini-"bust_connection", except that we
  	 * don't reschedule for later, 'cuz we're just about to try again right
! 	 * now.
  	 *
  	 * !!!
  	 * Which means this must only be called on the select() thread, since
  	 * only there are we allowed to actually close a connection.
  	 */
  	DB_ASSERT(dbenv, !TAILQ_EMPTY(&db_rep->connections));
! 	__repmgr_cleanup_connection(dbenv, conn);
  	ret = __repmgr_connect_site(dbenv, eid);
  	DB_ASSERT(dbenv, ret != DB_REP_UNAVAIL);
  	return (ret);
--- 749,773 ----
  	    "connecting to %s", __repmgr_format_site_loc(site, buffer));
  
  	/* If we've exhausted the list of possible addresses, give up. */
! 	if (ADDR_LIST_NEXT(&site->net_addr) == NULL) {
! 		STAT(db_rep->region->mstat.st_connect_fail++);
  		return (DB_REP_UNAVAIL);
+ 	}
  
  	/*
  	 * This is just like a little mini-"bust_connection", except that we
  	 * don't reschedule for later, 'cuz we're just about to try again right
! 	 * now.  (Note that we don't have to worry about message threads
! 	 * blocking on a full output queue: that can't happen when we're only
! 	 * just connecting.)
  	 *
  	 * !!!
  	 * Which means this must only be called on the select() thread, since
  	 * only there are we allowed to actually close a connection.
  	 */
  	DB_ASSERT(dbenv, !TAILQ_EMPTY(&db_rep->connections));
! 	if ((ret = __repmgr_cleanup_connection(dbenv, conn)) != 0)
! 		return (ret);
  	ret = __repmgr_connect_site(dbenv, eid);
  	DB_ASSERT(dbenv, ret != DB_REP_UNAVAIL);
  	return (ret);
*** repmgr/repmgr_sel.c	2007-10-31 10:23:52.000000000 -0700
--- repmgr/repmgr_sel.c	2007-10-31 10:23:53.000000000 -0700
***************
*** 36,45 ****
  
  /*
   * PUBLIC: int __repmgr_accept __P((DB_ENV *));
-  *
-  * !!!
-  * Only ever called in the select() thread, since we may call
-  * __repmgr_bust_connection(..., TRUE).
   */
  int
  __repmgr_accept(dbenv)
--- 36,41 ----
***************
*** 133,139 ****
  	case 0:
  		return (0);
  	case DB_REP_UNAVAIL:
! 		return (__repmgr_bust_connection(dbenv, conn, TRUE));
  	default:
  		return (ret);
  	}
--- 129,135 ----
  	case 0:
  		return (0);
  	case DB_REP_UNAVAIL:
! 		return (__repmgr_bust_connection(dbenv, conn));
  	default:
  		return (ret);
  	}
***************
*** 254,263 ****
   * starting with the "current" element of its address list and trying as many
   * addresses as necessary until the list is exhausted.
   *
-  * !!!
-  * Only ever called in the select() thread, since we may call
-  * __repmgr_bust_connection(..., TRUE).
-  *
   * PUBLIC: int __repmgr_connect_site __P((DB_ENV *, u_int eid));
   */
  int
--- 250,255 ----
***************
*** 332,338 ****
  		case 0:
  			break;
  		case DB_REP_UNAVAIL:
! 			return (__repmgr_bust_connection(dbenv, con, TRUE));
  		default:
  			return (ret);
  		}
--- 324,330 ----
  		case 0:
  			break;
  		case DB_REP_UNAVAIL:
! 			return (__repmgr_bust_connection(dbenv, con));
  		default:
  			return (ret);
  		}
***************
*** 437,443 ****
  
  	DB_SET_DBT(rec, my_addr->host, strlen(my_addr->host) + 1);
  
! 	return (__repmgr_send_one(dbenv, conn, REPMGR_HANDSHAKE, &cntrl, &rec));
  }
  
  /*
--- 429,443 ----
  
  	DB_SET_DBT(rec, my_addr->host, strlen(my_addr->host) + 1);
  
! 	/*
! 	 * It would of course be disastrous to block the select() thread, so
! 	 * pass the "blockable" argument as FALSE.  Fortunately blocking should
! 	 * never be necessary here, because the hand-shake is always the first
! 	 * thing we send.  Which is a good thing, because it would be almost as
! 	 * disastrous if we allowed ourselves to drop a handshake.
! 	 */
! 	return (__repmgr_send_one(dbenv,
! 	    conn, REPMGR_HANDSHAKE, &cntrl, &rec, FALSE));
  }
  
  /*
***************
*** 854,859 ****
--- 854,872 ----
  			conn->out_queue_length--;
  			if (--msg->ref_count <= 0)
  				__os_free(dbenv, msg);
+ 
+ 			/*
+ 			 * We've achieved enough movement to free up at least
+ 			 * one space in the outgoing queue.  Wake any message
+ 			 * threads that may be waiting for space.  Clear the
+ 			 * CONGESTED status so that when the queue reaches the
+ 			 * high-water mark again, the filling thread will be
+ 			 * allowed to try waiting again.
+ 			 */
+ 			F_CLR(conn, CONN_CONGESTED);
+ 			if (conn->blockers > 0 &&
+ 			    (ret = __repmgr_signal(&conn->drained)) != 0)
+ 				return (ret);
  		}
  	}
  
*** repmgr/repmgr_util.c	2007-10-31 10:23:52.000000000 -0700
--- repmgr/repmgr_util.c	2007-10-31 10:23:53.000000000 -0700
***************
*** 103,108 ****
--- 103,113 ----
  	db_rep = dbenv->rep_handle;
  	if ((ret = __os_malloc(dbenv, sizeof(REPMGR_CONNECTION), &c)) != 0)
  		return (ret);
+ 	if ((ret = __repmgr_alloc_cond(&c->drained)) != 0) {
+ 		__os_free(dbenv, c);
+ 		return (ret);
+ 	}
+ 	c->blockers = 0;
  
  	c->fd = s;
  	c->flags = flags;
*** repmgr/repmgr_windows.c	2007-10-31 10:23:52.000000000 -0700
--- repmgr/repmgr_windows.c	2007-10-31 10:23:53.000000000 -0700
***************
*** 11,16 ****
--- 11,19 ----
  #define	__INCLUDE_NETWORKING	1
  #include "db_int.h"
  
+ /* Convert time-out from microseconds to milliseconds, rounding up. */
+ #define	DB_TIMEOUT_TO_WINDOWS_TIMEOUT(t) (((t) + (US_PER_MS - 1)) / US_PER_MS)
+ 
  typedef struct __ack_waiter {
  	HANDLE event;
  	const DB_LSN *lsnp;
***************
*** 120,136 ****
  {
  	DB_REP *db_rep;
  	ACK_WAITER *me;
! 	DWORD ret;
! 	DWORD timeout;
  
  	db_rep = dbenv->rep_handle;
  
  	if ((ret = allocate_wait_slot(dbenv, &me)) != 0)
  		goto err;
  
- 	/* convert time-out from microseconds to milliseconds, rounding up */
  	timeout = db_rep->ack_timeout > 0 ?
! 	    ((db_rep->ack_timeout + (US_PER_MS - 1)) / US_PER_MS) : INFINITE;
  	me->lsnp = lsnp;
  	if ((ret = SignalObjectAndWait(db_rep->mutex, me->event, timeout,
  	    FALSE)) == WAIT_FAILED) {
--- 123,137 ----
  {
  	DB_REP *db_rep;
  	ACK_WAITER *me;
! 	DWORD ret, timeout;
  
  	db_rep = dbenv->rep_handle;
  
  	if ((ret = allocate_wait_slot(dbenv, &me)) != 0)
  		goto err;
  
  	timeout = db_rep->ack_timeout > 0 ?
! 	    DB_TIMEOUT_TO_WINDOWS_TIMEOUT(db_rep->ack_timeout) : INFINITE;
  	me->lsnp = lsnp;
  	if ((ret = SignalObjectAndWait(db_rep->mutex, me->event, timeout,
  	    FALSE)) == WAIT_FAILED) {
***************
*** 211,216 ****
--- 212,296 ----
  	db_rep->waiters->first_free = slot;
  }
  
+ /* (See requirements described in repmgr_posix.c.) */
+ int
+ __repmgr_await_drain(dbenv, conn, timeout)
+ 	DB_ENV *dbenv;
+ 	REPMGR_CONNECTION *conn;
+ 	db_timeout_t timeout;
+ {
+ 	DB_REP *db_rep;
+ 	db_timespec deadline, delta, now;
+ 	db_timeout_t t;
+ 	DWORD duration, ret;
+ 	int round_up;
+ 	
+ 	db_rep = dbenv->rep_handle;
+ 	
+ 	__os_gettime(dbenv, &deadline);
+ 	DB_TIMEOUT_TO_TIMESPEC(timeout, &delta);
+ 	timespecadd(&deadline, &delta);
+ 
+ 	while (conn->out_queue_length >= OUT_QUEUE_LIMIT) {
+ 		if (!ResetEvent(conn->drained))
+ 			return (GetLastError());
+ 
+ 		/* How long until the deadline? */
+ 		__os_gettime(dbenv, &now);
+ 		if (timespeccmp(&now, &deadline, >=)) {
+ 			F_SET(conn, CONN_CONGESTED);
+ 			return (0);
+ 		}
+ 		delta = deadline;
+ 		timespecsub(&delta, &now);
+ 		round_up = TRUE;
+ 		DB_TIMESPEC_TO_TIMEOUT(t, &delta, round_up);
+ 		duration = DB_TIMEOUT_TO_WINDOWS_TIMEOUT(t);
+ 
+ 		ret = SignalObjectAndWait(db_rep->mutex,
+ 		    conn->drained, duration, FALSE);
+ 		LOCK_MUTEX(db_rep->mutex);
+ 		if (ret == WAIT_FAILED)
+ 			return (GetLastError());
+ 		else if (ret == WAIT_TIMEOUT) {
+ 			F_SET(conn, CONN_CONGESTED);
+ 			return (0);
+ 		} else
+ 			DB_ASSERT(dbenv, ret == WAIT_OBJECT_0);
+ 		
+ 		if (db_rep->finished)
+ 			return (0);
+ 		if (F_ISSET(conn, CONN_DEFUNCT))
+ 			return (DB_REP_UNAVAIL);
+ 	}
+ 	return (0);
+ }
+ 
+ /*
+  * Creates a manual reset event, which is usually our best choice when we may
+  * have multiple threads waiting on a single event.
+  */
+ int
+ __repmgr_alloc_cond(c)
+ 	cond_var_t *c;
+ {
+ 	HANDLE event;
+ 
+ 	if ((event = CreateEvent(NULL, TRUE, FALSE, NULL)) == NULL)
+ 		return (GetLastError());
+ 	*c = event;
+ 	return (0);
+ }
+ 
+ int
+ __repmgr_free_cond(c)
+ 	cond_var_t *c;
+ {
+ 	if (CloseHandle(*c))
+ 		return (0);
+ 	return (GetLastError());
+ }
+ 
  /*
   * Make resource allocation an all-or-nothing affair, outside of this and the
   * close_sync function.  db_rep->waiters should be non-NULL iff all of these
***************
*** 488,493 ****
--- 568,576 ----
  		 * don't hurt anything flow-control-wise.
  		 */
  		TAILQ_FOREACH(conn, &db_rep->connections, entries) {
+ 			if (F_ISSET(conn, CONN_DEFUNCT))
+ 				continue;
+ 
  			if (F_ISSET(conn, CONN_CONNECTING) ||
  			    !STAILQ_EMPTY(&conn->outbound_queue) ||
  			    (!flow_control || !IS_VALID_EID(conn->eid))) {
***************
*** 534,541 ****
  		     conn != NULL;
  		     conn = next) {
  			next = TAILQ_NEXT(conn, entries);
! 			if (F_ISSET(conn, CONN_DEFUNCT))
! 				__repmgr_cleanup_connection(dbenv, conn);
  		}
  
  		/*
--- 617,626 ----
  		     conn != NULL;
  		     conn = next) {
  			next = TAILQ_NEXT(conn, entries);
! 			if (F_ISSET(conn, CONN_DEFUNCT) &&
! 			    (ret = __repmgr_cleanup_connection(dbenv,
! 			    conn)) != 0)
! 				goto unlock;
  		}
  
  		/*
***************
*** 587,597 ****
  	return (ret);
  }
  
- /*
-  * !!!
-  * Only ever called on the select() thread, since we may call
-  * __repmgr_bust_connection(..., TRUE).
-  */
  static int
  handle_completion(dbenv, conn)
  	DB_ENV *dbenv;
--- 672,677 ----
***************
*** 651,660 ****
  		}
  	}
  
! 	return (0);
! 
! err:	if (ret == DB_REP_UNAVAIL)
! 		return (__repmgr_bust_connection(dbenv, conn, TRUE));
  	return (ret);
  }
  
--- 731,742 ----
  		}
  	}
  
! err:
! 	if (ret == DB_REP_UNAVAIL) {
! 		if ((ret = __repmgr_bust_connection(dbenv, conn)) != 0)
! 			return (ret);
! 		ret = __repmgr_cleanup_connection(dbenv, conn);
! 	}
  	return (ret);
  }
  
***************
*** 708,714 ****
  	}
  
  	DB_ASSERT(dbenv, !TAILQ_EMPTY(&db_rep->connections));
! 	__repmgr_cleanup_connection(dbenv, conn);
  	ret = __repmgr_connect_site(dbenv, eid);
  	DB_ASSERT(dbenv, ret != DB_REP_UNAVAIL);
  	return (ret);
--- 790,797 ----
  	}
  
  	DB_ASSERT(dbenv, !TAILQ_EMPTY(&db_rep->connections));
! 	if ((ret = __repmgr_cleanup_connection(dbenv, conn)) != 0)
! 		return (ret);
  	ret = __repmgr_connect_site(dbenv, eid);
  	DB_ASSERT(dbenv, ret != DB_REP_UNAVAIL);
  	return (ret);