@@ -67,7 +67,7 @@ def get_contacts_mapping(cls):
6767 ]
6868
6969
70- def dedup_consecutive (table , id , order_by , dedup_on ):
70+ def dedup_consecutive (table , unique_id , id , order_by , dedup_on ):
7171 # Many of our raw data tables have a similar structure: a contact id column,
7272 # an insert time column, and several other pieces of raw data. If someone
7373 # inserts a "new" record for a certain id, but none of the raw data is
@@ -81,15 +81,16 @@ def dedup_consecutive(table, id, order_by, dedup_on):
8181 # not work well on null values.
8282
8383 sq = select (
84+ unique_id ,
8485 id ,
8586 order_by ,
8687 dedup_on .bool_op ("IS NOT DISTINCT FROM" )(
8788 func .lag (dedup_on ).over (partition_by = id , order_by = order_by )
8889 ).label ("is_dupe" ),
8990 ).subquery ()
9091
91- to_delete = select (sq .c [0 ], sq . c [ 1 ] ).where (sq .c [2 ]).subquery ()
92- return delete (table ).where (( id == to_delete .c [0 ]) & ( order_by == to_delete . c [ 1 ]) )
92+ to_delete = select (sq .c [0 ]).where (sq .c [3 ]).subquery ()
93+ return delete (table ).where (unique_id == to_delete .c [0 ])
9394
9495
9596def normalize_phone_number (number ):
@@ -181,6 +182,7 @@ def insert_from_file_df(cls, df, conn):
181182 conn .execute (
182183 dedup_consecutive (
183184 cls .__table__ ,
185+ unique_id = cls ._id ,
184186 id = cls .contact_id ,
185187 order_by = cls .created_date ,
186188 dedup_on = tuple_ (* dedup_on ),
@@ -249,6 +251,7 @@ def insert_from_df(cls, df, conn):
249251 conn .execute (
250252 dedup_consecutive (
251253 cls .__table__ ,
254+ unique_id = cls ._id ,
252255 id = cls .internal_id ,
253256 order_by = cls .created_date ,
254257 dedup_on = tuple_ (* dedup_on ),
@@ -315,6 +318,7 @@ def insert_from_file(cls, xl_file, conn):
315318 conn .execute (
316319 dedup_consecutive (
317320 cls .__table__ ,
321+ unique_id = cls ._id ,
318322 id = cls .number ,
319323 order_by = cls .created_date ,
320324 dedup_on = tuple_ (* dedup_on ),
0 commit comments