2
0

Add non-blocking IO

This eliminates an edge case that can cause a deadlock and is a
prerequisite to cheaply testing connection liveness and to recoving a
connection after a timeout.

https://github.com/jackc/pgconn/issues/27

Squashed commit of the following:

commit 0d7b0dddea1575e9fd72592665badb8cbdd581cc
Author: Jack Christensen <jack@jackchristensen.com>
Date:   Sat Jun 25 13:15:05 2022 -0500

    Add test for non-blocking IO preventing deadlock

commit 79d68d23d38bb03ddb8bf13cb45792430eaf959a
Author: Jack Christensen <jack@jackchristensen.com>
Date:   Sat Jun 18 18:23:24 2022 -0500

    Release CopyFrom buf when done

commit 95a43139c7b0b7557898c4480e5b3e42417ee3c0
Author: Jack Christensen <jack@jackchristensen.com>
Date:   Sat Jun 18 18:22:32 2022 -0500

    Avoid allocations with non-blocking write

commit 6b63ceee076794bc4380495a55dd414dbbd08a43
Author: Jack Christensen <jack@jackchristensen.com>
Date:   Sat Jun 18 17:46:49 2022 -0500

    Simplify iobufpool usage

commit 60ecdda02e5a24c894df4f58d31c485b90de5d5b
Author: Jack Christensen <jack@jackchristensen.com>
Date:   Sat Jun 18 11:51:59 2022 -0500

    Add true non-blocking IO

commit 7dd26a34a182d4aacaed3bf8c09f9cc48a7b6156
Author: Jack Christensen <jack@jackchristensen.com>
Date:   Sat Jun 4 20:28:23 2022 -0500

    Fix block when reading more than buffered

commit afa702213f1b6d24c976406448301b2be53b7f70
Author: Jack Christensen <jack@jackchristensen.com>
Date:   Sat Jun 4 20:10:23 2022 -0500

    More TLS support

commit 51655bf8f40321d5f89bc3c02dd55fba0ac6aa49
Author: Jack Christensen <jack@jackchristensen.com>
Date:   Sat Jun 4 17:46:00 2022 -0500

    Steps toward TLS

commit 2b80beb1ed75f0f58db8188b87753dbc26b62098
Author: Jack Christensen <jack@jackchristensen.com>
Date:   Sat Jun 4 13:06:29 2022 -0500

    Litle more TLS support

commit 765b2c6e7b034ff6ffab3974579fd6ee7add593b
Author: Jack Christensen <jack@jackchristensen.com>
Date:   Sat Jun 4 12:29:30 2022 -0500

    Add testing of TLS

commit 5b64432afbed9224f9512cc46624c88e7ebec625
Author: Jack Christensen <jack@jackchristensen.com>
Date:   Sat Jun 4 09:48:19 2022 -0500

    Introduce testVariants in prep for TLS

commit ecebd7b103d4a9125c61e83f3651b950658b0b84
Author: Jack Christensen <jack@jackchristensen.com>
Date:   Sat Jun 4 09:32:14 2022 -0500

    Handle and test read of previously buffered data

commit 09c64d8cf3ca5be1a31bef46bf78fa5cb9fae831
Author: Jack Christensen <jack@jackchristensen.com>
Date:   Sat Jun 4 09:04:48 2022 -0500

    Rename nbbconn to nbconn

commit 73398bc67a7b7bd1aa044fb9b0546f4198ef92d2
Author: Jack Christensen <jack@jackchristensen.com>
Date:   Sat Jun 4 08:59:53 2022 -0500

    Remove backup files

commit f1df39a29d23ae4e5175b92c69697f2bf9b4e112
Author: Jack Christensen <jack@jackchristensen.com>
Date:   Sat Jun 4 08:58:05 2022 -0500

    Initial passing tests

commit ea3cdab234343fc9761d9b7966c5346179cd1b01
Author: Jack Christensen <jack@jackchristensen.com>
Date:   Sat Jun 4 08:38:57 2022 -0500

    Fix connect timeout

commit ca22396789d120ff556f9704f4470268fbc8c0d8
Author: Jack Christensen <jack@jackchristensen.com>
Date:   Thu Jun 2 19:32:55 2022 -0500

    wip

commit 2e7b46d5d7454daf0859dd48f8a8e190995164c5
Author: Jack Christensen <jack@jackchristensen.com>
Date:   Mon May 30 08:32:43 2022 -0500

    Update comments

commit 7d04dc5caa80cb147929b6f65bab60a27baaff89
Author: Jack Christensen <jack@jackchristensen.com>
Date:   Sat May 28 19:43:23 2022 -0500

    Fix broken test

commit bf1edc77d70465b4097a59c08c581033d2033ac6
Author: Jack Christensen <jack@jackchristensen.com>
Date:   Sat May 28 19:40:33 2022 -0500

    fixed putting wrong size bufs

commit 1f7a855b2e4d1e14f85ac5f5683e2b93db0a4bd9
Author: Jack Christensen <jack@jackchristensen.com>
Date:   Sat May 28 18:13:47 2022 -0500

    initial not quite working non-blocking conn
This commit is contained in:
Jack Christensen
2022-06-25 13:15:31 -05:00
parent c0a4d1b9ce
commit 811d855a35
8 changed files with 1333 additions and 151 deletions
+74 -114
View File
@@ -13,9 +13,10 @@ import (
"net"
"strconv"
"strings"
"sync"
"time"
"github.com/jackc/pgx/v5/internal/iobufpool"
"github.com/jackc/pgx/v5/internal/nbconn"
"github.com/jackc/pgx/v5/internal/pgio"
"github.com/jackc/pgx/v5/pgconn/internal/ctxwatch"
"github.com/jackc/pgx/v5/pgproto3"
@@ -75,11 +76,6 @@ type PgConn struct {
status byte // One of connStatus* constants
bufferingReceive bool
bufferingReceiveMux sync.Mutex
bufferingReceiveMsg pgproto3.BackendMessage
bufferingReceiveErr error
peekedMsg pgproto3.BackendMessage
// Reusable / preallocated resources
@@ -234,13 +230,14 @@ func connect(ctx context.Context, config *Config, fallbackConfig *FallbackConfig
}
return nil, &connectError{config: config, msg: "dial error", err: err}
}
netConn = nbconn.NewNetConn(netConn, false)
pgConn.conn = netConn
pgConn.contextWatcher = newContextWatcher(netConn)
pgConn.contextWatcher.Watch(ctx)
if fallbackConfig.TLSConfig != nil {
tlsConn, err := startTLS(netConn, fallbackConfig.TLSConfig)
tlsConn, err := startTLS(netConn.(*nbconn.NetConn), fallbackConfig.TLSConfig)
pgConn.contextWatcher.Unwatch() // Always unwatch `netConn` after TLS.
if err != nil {
netConn.Close()
@@ -356,7 +353,7 @@ func newContextWatcher(conn net.Conn) *ctxwatch.ContextWatcher {
)
}
func startTLS(conn net.Conn, tlsConfig *tls.Config) (net.Conn, error) {
func startTLS(conn *nbconn.NetConn, tlsConfig *tls.Config) (net.Conn, error) {
err := binary.Write(conn, binary.BigEndian, []int32{8, 80877103})
if err != nil {
return nil, err
@@ -371,7 +368,12 @@ func startTLS(conn net.Conn, tlsConfig *tls.Config) (net.Conn, error) {
return nil, errors.New("server refused TLS connection")
}
return tls.Client(conn, tlsConfig), nil
tlsConn, err := nbconn.TLSClient(conn, tlsConfig)
if err != nil {
return nil, err
}
return tlsConn, nil
}
func (pgConn *PgConn) txPasswordMessage(password string) (err error) {
@@ -385,24 +387,6 @@ func hexMD5(s string) string {
return hex.EncodeToString(hash.Sum(nil))
}
func (pgConn *PgConn) signalMessage() chan struct{} {
if pgConn.bufferingReceive {
panic("BUG: signalMessage when already in progress")
}
pgConn.bufferingReceive = true
pgConn.bufferingReceiveMux.Lock()
ch := make(chan struct{})
go func() {
pgConn.bufferingReceiveMsg, pgConn.bufferingReceiveErr = pgConn.frontend.Receive()
pgConn.bufferingReceiveMux.Unlock()
close(ch)
}()
return ch
}
// ReceiveMessage receives one wire protocol message from the PostgreSQL server. It must only be used when the
// connection is not busy. e.g. It is an error to call ReceiveMessage while reading the result of a query. The messages
// are still handled by the core pgconn message handling system so receiving a NotificationResponse will still trigger
@@ -442,25 +426,13 @@ func (pgConn *PgConn) peekMessage() (pgproto3.BackendMessage, error) {
return pgConn.peekedMsg, nil
}
var msg pgproto3.BackendMessage
var err error
if pgConn.bufferingReceive {
pgConn.bufferingReceiveMux.Lock()
msg = pgConn.bufferingReceiveMsg
err = pgConn.bufferingReceiveErr
pgConn.bufferingReceiveMux.Unlock()
pgConn.bufferingReceive = false
// If a timeout error happened in the background try the read again.
var netErr net.Error
if errors.As(err, &netErr) && netErr.Timeout() {
msg, err = pgConn.frontend.Receive()
}
} else {
msg, err = pgConn.frontend.Receive()
}
msg, err := pgConn.frontend.Receive()
if err != nil {
if errors.Is(err, nbconn.ErrWouldBlock) {
return nil, err
}
// Close on anything other than timeout error - everything else is fatal
var netErr net.Error
isNetErr := errors.As(err, &netErr)
@@ -479,13 +451,6 @@ func (pgConn *PgConn) peekMessage() (pgproto3.BackendMessage, error) {
func (pgConn *PgConn) receiveMessage() (pgproto3.BackendMessage, error) {
msg, err := pgConn.peekMessage()
if err != nil {
// Close on anything other than timeout error - everything else is fatal
var netErr net.Error
isNetErr := errors.As(err, &netErr)
if !(isNetErr && netErr.Timeout()) {
pgConn.asyncClose()
}
return nil, err
}
pgConn.peekedMsg = nil
@@ -1173,62 +1138,58 @@ func (pgConn *PgConn) CopyFrom(ctx context.Context, r io.Reader, sql string) (Co
// Send copy to command
pgConn.frontend.SendQuery(&pgproto3.Query{String: sql})
err := pgConn.frontend.Flush()
if err != nil {
pgConn.asyncClose()
return CommandTag{}, err
}
// Send copy data
abortCopyChan := make(chan struct{})
copyErrChan := make(chan error, 1)
signalMessageChan := pgConn.signalMessage()
senderDoneChan := make(chan struct{})
go func() {
defer close(senderDoneChan)
buf := make([]byte, 0, 65536)
buf = append(buf, 'd')
sp := len(buf)
for {
n, readErr := r.Read(buf[5:cap(buf)])
if n > 0 {
buf = buf[0 : n+5]
pgio.SetInt32(buf[sp:], int32(n+4))
writeErr := pgConn.frontend.SendUnbufferedEncodedCopyData(buf)
if writeErr != nil {
// Write errors are always fatal, but we can't use asyncClose because we are in a different goroutine.
pgConn.conn.Close()
copyErrChan <- writeErr
return
}
}
if readErr != nil {
copyErrChan <- readErr
return
}
select {
case <-abortCopyChan:
return
default:
}
err = pgConn.conn.SetReadDeadline(nbconn.NonBlockingDeadline)
if err != nil {
pgConn.asyncClose()
return CommandTag{}, err
}
nonblocking := true
defer func() {
if nonblocking {
pgConn.conn.SetReadDeadline(time.Time{})
}
}()
var pgErr error
var copyErr error
for copyErr == nil && pgErr == nil {
select {
case copyErr = <-copyErrChan:
case <-signalMessageChan:
buf := iobufpool.Get(65536)
defer iobufpool.Put(buf)
buf[0] = 'd'
var readErr, pgErr error
for pgErr == nil {
// Read chunk from r.
var n int
n, readErr = r.Read(buf[5:cap(buf)])
// Send chunk to PostgreSQL.
if n > 0 {
buf = buf[0 : n+5]
pgio.SetInt32(buf[1:], int32(n+4))
writeErr := pgConn.frontend.SendUnbufferedEncodedCopyData(buf)
if writeErr != nil {
pgConn.asyncClose()
return CommandTag{}, err
}
}
// Abort loop if there was a read error.
if readErr != nil {
break
}
// Read messages until error or none available.
for pgErr == nil {
msg, err := pgConn.receiveMessage()
if err != nil {
if errors.Is(err, nbconn.ErrWouldBlock) {
break
}
pgConn.asyncClose()
return CommandTag{}, preferContextOverNetTimeoutError(ctx, err)
}
@@ -1236,18 +1197,22 @@ func (pgConn *PgConn) CopyFrom(ctx context.Context, r io.Reader, sql string) (Co
switch msg := msg.(type) {
case *pgproto3.ErrorResponse:
pgErr = ErrorResponseToPgError(msg)
default:
signalMessageChan = pgConn.signalMessage()
break
}
}
}
close(abortCopyChan)
<-senderDoneChan
if copyErr == io.EOF || pgErr != nil {
err = pgConn.conn.SetReadDeadline(time.Time{})
if err != nil {
pgConn.asyncClose()
return CommandTag{}, err
}
nonblocking = false
if readErr == io.EOF || pgErr != nil {
pgConn.frontend.Send(&pgproto3.CopyDone{})
} else {
pgConn.frontend.Send(&pgproto3.CopyFail{Message: copyErr.Error()})
pgConn.frontend.Send(&pgproto3.CopyFail{Message: readErr.Error()})
}
err = pgConn.frontend.Flush()
if err != nil {
@@ -1603,18 +1568,13 @@ func (pgConn *PgConn) ExecBatch(ctx context.Context, batch *Batch) *MultiResultR
batch.buf = (&pgproto3.Sync{}).Encode(batch.buf)
// A large batch can deadlock without concurrent reading and writing. If the Write fails the underlying net.Conn is
// closed. This is all that can be done without introducing a race condition or adding a concurrent safe communication
// channel to relay the error back. The practical effect of this is that the underlying Write error is not reported.
// The error the code reading the batch results receives will be a closed connection error.
//
// See https://github.com/jackc/pgx/issues/374.
go func() {
_, err := pgConn.conn.Write(batch.buf)
if err != nil {
pgConn.conn.Close()
}
}()
_, err := pgConn.conn.Write(batch.buf)
if err != nil {
multiResult.closed = true
multiResult.err = err
pgConn.unlock()
return multiResult
}
return multiResult
}
+37 -6
View File
@@ -1849,13 +1849,14 @@ func TestConnCancelRequest(t *testing.T) {
multiResult := pgConn.Exec(context.Background(), "select 'Hello, world', pg_sleep(2)")
// This test flickers without the Sleep. It appears that since Exec only sends the query and returns without awaiting a
// response that the CancelRequest can race it and be received before the query is running and cancellable. So wait a
// few milliseconds.
time.Sleep(50 * time.Millisecond)
go func() {
// The query is actually sent when multiResult.NextResult() is called. So wait to ensure it is sent.
// Once Flush is available this could use that instead.
time.Sleep(500 * time.Millisecond)
err = pgConn.CancelRequest(context.Background())
require.NoError(t, err)
err = pgConn.CancelRequest(context.Background())
require.NoError(t, err)
}()
for multiResult.NextResult() {
}
@@ -2027,6 +2028,36 @@ func TestFatalErrorReceivedAfterCommandComplete(t *testing.T) {
require.Error(t, err)
}
// https://github.com/jackc/pgconn/issues/27
func TestConnLargeResponseWhileWritingDoesNotDeadlock(t *testing.T) {
t.Parallel()
pgConn, err := pgconn.Connect(context.Background(), os.Getenv("PGX_TEST_CONN_STRING"))
require.NoError(t, err)
defer closeConn(t, pgConn)
_, err = pgConn.Exec(context.Background(), "set client_min_messages = debug5").ReadAll()
require.NoError(t, err)
// The actual contents of this test aren't important. What's important is a large amount of data to be written and
// because of client_min_messages = debug5 the server will return a large amount of data.
paramCount := math.MaxUint16
params := make([]string, 0, paramCount)
args := make([][]byte, 0, paramCount)
for i := 0; i < paramCount; i++ {
params = append(params, fmt.Sprintf("($%d::text)", i+1))
args = append(args, []byte(strconv.Itoa(i)))
}
sql := "values" + strings.Join(params, ", ")
result := pgConn.ExecParams(context.Background(), sql, args, nil, nil, nil).Read()
require.NoError(t, result.Err)
require.Len(t, result.Rows, paramCount)
ensureConnValid(t, pgConn)
}
func Example() {
pgConn, err := pgconn.Connect(context.Background(), os.Getenv("PGX_TEST_CONN_STRING"))
if err != nil {