1313import backoff
1414import os
1515
16+ class PeerDiscoveryException (Exception ):
17+ pass
18+
1619def construct_service_record ():
1720 # Drop our Pod's unique identity and replace with '_couchdb._tcp'
1821 return os .getenv ('SRV_RECORD' ) or '.' .join (['_couchdb' , '_tcp' ] + socket .getfqdn ().split ('.' )[1 :])
1922
2023@backoff .on_exception (
2124 backoff .expo ,
2225 dns .resolver .NXDOMAIN ,
23- max_tries = 10
26+ max_tries = 15
27+ )
28+ @backoff .on_exception (
29+ backoff .expo ,
30+ PeerDiscoveryException ,
31+ max_tries = 15
2432)
2533def discover_peers (service_record ):
26- print ('Resolving SRV record' , service_record )
27- answers = dns .resolver .query (service_record , 'SRV' )
34+ expected_peers_count = os .getenv ('COUCHDB_CLUSTER_SIZE' )
35+ if expected_peers_count :
36+ expected_peers_count = int (expected_peers_count )
37+ print ('Expecting' , expected_peers_count , 'peers...' )
38+ else :
39+ print ('Looks like COUCHDB_CLUSTER_SIZE is not set, will not wait for DNS to fully propagate...' )
40+ print ('Resolving SRV record:' , service_record )
2841 # Erlang requires that we drop the trailing period from the absolute DNS
2942 # name to form the hostname used for the Erlang node. This feels hacky
3043 # but not sure of a more official answer
31- return [rdata .target .to_text ()[:- 1 ] for rdata in answers ]
44+ answers = dns .resolver .query (service_record , 'SRV' )
45+ peers = [rdata .target .to_text ()[:- 1 ] for rdata in answers ]
46+ peers_count = len (peers )
47+ if expected_peers_count :
48+ print ('Discovered' , peers_count , 'of' , expected_peers_count , 'peers:' , peers )
49+ if peers_count != expected_peers_count :
50+ print ('Waiting for cluster DNS to fully propagate...' )
51+ raise PeerDiscoveryException
52+ else :
53+ print ('Discovered' , peers_count , 'peers:' , peers )
54+ return peers
3255
3356@backoff .on_exception (
3457 backoff .expo ,
@@ -45,7 +68,7 @@ def connect_the_dots(names):
4568 else :
4669 resp = requests .put (uri , data = json .dumps (doc ))
4770 while resp .status_code == 404 :
48- print ('Waiting for _nodes DB to be created ...' )
71+ print ('Waiting for _nodes DB to be created...' )
4972 time .sleep (5 )
5073 resp = requests .put (uri , data = json .dumps (doc ))
5174 print ('Adding cluster member' , name , resp .status_code )
0 commit comments