From 3170339f88d3a883c91effcbb1576b09480b8e7f Mon Sep 17 00:00:00 2001 From: circlekkk Date: Fri, 14 Apr 2023 15:03:37 +0800 Subject: [PATCH 1/9] test Signed-off-by: circlekkk --- python/federatedml/secureprotol/ecc/__init__.py | 1 + 1 file changed, 1 insertion(+) create mode 100644 python/federatedml/secureprotol/ecc/__init__.py diff --git a/python/federatedml/secureprotol/ecc/__init__.py b/python/federatedml/secureprotol/ecc/__init__.py new file mode 100644 index 0000000000..8b13789179 --- /dev/null +++ b/python/federatedml/secureprotol/ecc/__init__.py @@ -0,0 +1 @@ + From 3000758a4e608aa24ed89cc559915d4f371e7f45 Mon Sep 17 00:00:00 2001 From: circlekkk Date: Fri, 14 Apr 2023 15:22:27 +0800 Subject: [PATCH 2/9] Signed-off-by: circlekkk MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 修改: python/federatedml/util/consts.py --- python/federatedml/secureprotol/ecc/sm2.py | 62 ++++++++++++++++++++++ 1 file changed, 62 insertions(+) create mode 100644 python/federatedml/secureprotol/ecc/sm2.py diff --git a/python/federatedml/secureprotol/ecc/sm2.py b/python/federatedml/secureprotol/ecc/sm2.py new file mode 100644 index 0000000000..5243868b5f --- /dev/null +++ b/python/federatedml/secureprotol/ecc/sm2.py @@ -0,0 +1,62 @@ +# -*-coding: Utf-8 -*- + +from federatedml.util import LOGGER +import tinyec.ec as ec + + + + +SM2_p = 0xFFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFF +SM2_a = 0xFFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFC +SM2_b = 0x28E9FA9E9D9F5E344D5A9E4BCF6509A7F39789F515AB8F92DDBCBD414D940E93 +SM2_n = 0xFFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFF7203DF6B21C6052B53BBF40939D54123 +SM2_Gx = 0x32C4AE2C1F1981195F9904466A39C9948FE30BBFF2660BE1715A4589334C74C7 +SM2_Gy = 0xBC3736A2F4F6779C59BDCEE36B692153D0A9877CC62A474002DF32E52139F0A0 +PARA_SIZE = 32 + + + +def to_byte(x, size=None): + if isinstance(x, int): + if size is None: + size = 0 + tmp = x >> 64 + while tmp: + size += 8 + tmp >>= 64 + tmp = x >> (size << 3) + while tmp: + size += 1 + tmp >>= 8 + elif x >> (size << 3): + x &= (1 << (size << 3)) - 1 + return x.to_bytes(size, byteorder='big') + + +class SM2: + def __init__(self, p=SM2_p, a=SM2_a, b=SM2_b, n=SM2_n, G=(SM2_Gx, SM2_Gy), h=1,curve_key=None): + field = ec.SubGroup(p, G, n, h) + self.curve = ec.Curve(a, b, field) + self.G = ec.Point(self.curve, G[0], G[1]) + keypair = ec.make_keypair(self.curve) + self.private_key = curve_key or keypair.priv + self.p, self.a, self.b, self.n, self.h = p, a, b, n, h + + def mult_point(self, k): + return k * self.G + + def encrypt(self, hashvalue): + int_hashvalue = int.from_bytes(hashvalue, byteorder='big') + hash_to_curve = self.mult_point(int_hashvalue) + c = (hash_to_curve.x + hash_to_curve.y) * self.private_key + return to_byte(c) + + def diffie_hellman(self, ciphertext): + ciphertext = int.from_bytes(ciphertext, byteorder='big') + c2 = ciphertext * self.private_key + return to_byte(c2) + + def get_private_key(self): + return self.private_key + + From 410f5a0cec55a0af45ea829dbe389086d2739f24 Mon Sep 17 00:00:00 2001 From: circlekkk Date: Fri, 14 Apr 2023 15:27:48 +0800 Subject: [PATCH 3/9] Revert "Signed-off-by: circlekkk " This reverts commit 3000758a4e608aa24ed89cc559915d4f371e7f45. --- python/federatedml/secureprotol/ecc/sm2.py | 62 ---------------------- 1 file changed, 62 deletions(-) delete mode 100644 python/federatedml/secureprotol/ecc/sm2.py diff --git a/python/federatedml/secureprotol/ecc/sm2.py b/python/federatedml/secureprotol/ecc/sm2.py deleted file mode 100644 index 5243868b5f..0000000000 --- a/python/federatedml/secureprotol/ecc/sm2.py +++ /dev/null @@ -1,62 +0,0 @@ -# -*-coding: Utf-8 -*- - -from federatedml.util import LOGGER -import tinyec.ec as ec - - - - -SM2_p = 0xFFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFF -SM2_a = 0xFFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFC -SM2_b = 0x28E9FA9E9D9F5E344D5A9E4BCF6509A7F39789F515AB8F92DDBCBD414D940E93 -SM2_n = 0xFFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFF7203DF6B21C6052B53BBF40939D54123 -SM2_Gx = 0x32C4AE2C1F1981195F9904466A39C9948FE30BBFF2660BE1715A4589334C74C7 -SM2_Gy = 0xBC3736A2F4F6779C59BDCEE36B692153D0A9877CC62A474002DF32E52139F0A0 -PARA_SIZE = 32 - - - -def to_byte(x, size=None): - if isinstance(x, int): - if size is None: - size = 0 - tmp = x >> 64 - while tmp: - size += 8 - tmp >>= 64 - tmp = x >> (size << 3) - while tmp: - size += 1 - tmp >>= 8 - elif x >> (size << 3): - x &= (1 << (size << 3)) - 1 - return x.to_bytes(size, byteorder='big') - - -class SM2: - def __init__(self, p=SM2_p, a=SM2_a, b=SM2_b, n=SM2_n, G=(SM2_Gx, SM2_Gy), h=1,curve_key=None): - field = ec.SubGroup(p, G, n, h) - self.curve = ec.Curve(a, b, field) - self.G = ec.Point(self.curve, G[0], G[1]) - keypair = ec.make_keypair(self.curve) - self.private_key = curve_key or keypair.priv - self.p, self.a, self.b, self.n, self.h = p, a, b, n, h - - def mult_point(self, k): - return k * self.G - - def encrypt(self, hashvalue): - int_hashvalue = int.from_bytes(hashvalue, byteorder='big') - hash_to_curve = self.mult_point(int_hashvalue) - c = (hash_to_curve.x + hash_to_curve.y) * self.private_key - return to_byte(c) - - def diffie_hellman(self, ciphertext): - ciphertext = int.from_bytes(ciphertext, byteorder='big') - c2 = ciphertext * self.private_key - return to_byte(c2) - - def get_private_key(self): - return self.private_key - - From b03abbec290e8dd842628a4507aa4a65aca9f392 Mon Sep 17 00:00:00 2001 From: circlekkk Date: Sat, 6 May 2023 09:40:23 +0800 Subject: [PATCH 4/9] Adding SM2 Elliptic Curve,psi based on ecdh Signed-off-by: circlekkk --- python/fate_client/pipeline/param/consts.py | 2 +- python/federatedml/param/intersect_param.py | 2 +- python/federatedml/secureprotol/ecc/sm2.py | 105 ++++++++++++++++++ .../secureprotol/elliptic_curve_encryption.py | 13 ++- python/federatedml/util/consts.py | 1 + 5 files changed, 117 insertions(+), 6 deletions(-) create mode 100644 python/federatedml/secureprotol/ecc/sm2.py diff --git a/python/fate_client/pipeline/param/consts.py b/python/fate_client/pipeline/param/consts.py index 21fee8a3f0..3f2b4e039d 100644 --- a/python/fate_client/pipeline/param/consts.py +++ b/python/fate_client/pipeline/param/consts.py @@ -344,7 +344,7 @@ # curve names CURVE25519 = 'curve25519' - +SM2='sm2' # positive unlabeled PROBABILITY = "probability" QUANTITY = "quantity" diff --git a/python/federatedml/param/intersect_param.py b/python/federatedml/param/intersect_param.py index 7b59ccc96b..dd3cf8c887 100644 --- a/python/federatedml/param/intersect_param.py +++ b/python/federatedml/param/intersect_param.py @@ -251,7 +251,7 @@ def check(self): consts.SM3], f"{descr}hash_method") - self.curve = self.check_and_change_lower(self.curve, [consts.CURVE25519], f"{descr}curve") + self.curve = self.check_and_change_lower(self.curve, [consts.CURVE25519,consts.SM2], f"{descr}curve") LOGGER.debug("Finish ECDHParam parameter check!") return True diff --git a/python/federatedml/secureprotol/ecc/sm2.py b/python/federatedml/secureprotol/ecc/sm2.py new file mode 100644 index 0000000000..da749f9744 --- /dev/null +++ b/python/federatedml/secureprotol/ecc/sm2.py @@ -0,0 +1,105 @@ +# -*-coding: Utf-8 -*- +''' +==================== +@File : sm2 .py +author: circle +Time:2023/4/3 +@Desc: +===================== +''' +from federatedml.util import LOGGER +import tinyec.ec as ec + +LOGGER.setLevel('DEBUG') + +# 国家密码管理局:SM2椭圆曲线公钥密码算法推荐曲线参数 +SM2_p = 0xFFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFF +SM2_a = 0xFFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFC +SM2_b = 0x28E9FA9E9D9F5E344D5A9E4BCF6509A7F39789F515AB8F92DDBCBD414D940E93 +SM2_n = 0xFFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFF7203DF6B21C6052B53BBF40939D54123 +SM2_Gx = 0x32C4AE2C1F1981195F9904466A39C9948FE30BBFF2660BE1715A4589334C74C7 +SM2_Gy = 0xBC3736A2F4F6779C59BDCEE36B692153D0A9877CC62A474002DF32E52139F0A0 +PARA_SIZE = 32 # 参数长度(字节) + + +# 转换为bytes,第二参数为字节数(可不填) +def to_byte(x, size=None): + if isinstance(x, int): + if size is None: # 计算合适的字节数 + size = 0 + tmp = x >> 64 + while tmp: + size += 8 + tmp >>= 64 + tmp = x >> (size << 3) + while tmp: + size += 1 + tmp >>= 8 + elif x >> (size << 3): # 指定的字节数不够则截取低位 + x &= (1 << (size << 3)) - 1 + return x.to_bytes(size, byteorder='big') + + +class SM2: + def __init__(self, p=SM2_p, a=SM2_a, b=SM2_b, n=SM2_n, G=(SM2_Gx, SM2_Gy), h=1,curve_key=None): + field = ec.SubGroup(p, G, n, h) + self.curve = ec.Curve(a, b, field) + self.G = ec.Point(self.curve, G[0], G[1]) + keypair = ec.make_keypair(self.curve) + self.private_key = curve_key or keypair.priv + self.p, self.a, self.b, self.n, self.h = p, a, b, n, h + + def mult_point(self, k): + return k * self.G + + def encrypt(self, hashvalue): + LOGGER.info('!!!!sm2 encry start') + int_hashvalue = int.from_bytes(hashvalue, byteorder='big') + hash_to_curve = self.mult_point(int_hashvalue) + c = (hash_to_curve.x + hash_to_curve.y) * self.private_key + LOGGER.info('!!!!sm2 encry finish') + return to_byte(c) + + def diffie_hellman(self, ciphertext): + ciphertext = int.from_bytes(ciphertext, byteorder='big') + c2 = ciphertext * self.private_key + return to_byte(c2) + + def get_private_key(self): + return self.private_key + + +def sha256_hash(plaintext): + from federatedml.secureprotol.hash.hash_factory import Hash + hash_operator = Hash('sha256', hex_output=False) + hashvalue = hash_operator.compute(plaintext, suffix_salt='12345') + print(f'hash finished!\n,plaintext is:{plaintext},\n,hashvalue is:{hashvalue}') + return hashvalue + + +if __name__ == '__main__': + s1 = SM2() + print('s1 pr key:', s1.private_key, type(s1.private_key)) + plaintext1 = b'12345' + hashvalue1 = sha256_hash(plaintext1) + cipher1 = s1.encrypt(hashvalue1) + + s2 = SM2() + print('s2 pr key:', s2.private_key) + plaintext2 = b'12345' + hashvalue2 = sha256_hash(plaintext2) + cipher2 = s2.encrypt(hashvalue2) + # + print('cipher1:', cipher1, type(cipher1)) + + print('cipher2:', cipher2) + + sign1 = s1.diffie_hellman(cipher2) + sign2 = s2.diffie_hellman(cipher1) + # + print('sign1:', sign1, type(sign1)) + print('sign2:', sign2, type(sign2)) + if sign1 == sign2: + print('True') + else: + print('False') diff --git a/python/federatedml/secureprotol/elliptic_curve_encryption.py b/python/federatedml/secureprotol/elliptic_curve_encryption.py index 4eaea5f581..937806f1b3 100644 --- a/python/federatedml/secureprotol/elliptic_curve_encryption.py +++ b/python/federatedml/secureprotol/elliptic_curve_encryption.py @@ -19,7 +19,7 @@ from fate_crypto.psi import Curve25519 - +from federatedml.secureprotol.ecc.sm2 import SM2 class EllipticCurve(object): """ @@ -31,9 +31,14 @@ def __init__(self, curve_name, curve_key=None): @staticmethod def __get_curve_instance(curve_name, curve_key): - if curve_key is None: - return Curve25519() - return Curve25519(curve_key) + if curve_name == 'curve25519': + if curve_key is None: + return Curve25519() + return Curve25519(curve_key) + if curve_name == 'sm2': + if curve_key is None: + return SM2() + return SM2(curve_key) def get_curve_key(self): return self.curve.get_private_key() diff --git a/python/federatedml/util/consts.py b/python/federatedml/util/consts.py index d3685a8e52..357fa8905c 100644 --- a/python/federatedml/util/consts.py +++ b/python/federatedml/util/consts.py @@ -349,6 +349,7 @@ pytorch_backend = 'pytorch' keras_backend = 'keras' CURVE25519 = 'curve25519' +SM2='sm2' # HOMO NN Framework FEDAVG_TRAINER = 'fedavg_trainer' From 7d54b1049cf7f20d81dacb0ebe2fd4d75470337a Mon Sep 17 00:00:00 2001 From: circlekkk <44366206+circlekkk@users.noreply.github.com> Date: Sat, 6 May 2023 11:06:24 +0800 Subject: [PATCH 5/9] Update sm2.py Signed-off-by: circlekkk <44366206+circlekkk@users.noreply.github.com> --- python/federatedml/secureprotol/ecc/sm2.py | 35 +--------------------- 1 file changed, 1 insertion(+), 34 deletions(-) diff --git a/python/federatedml/secureprotol/ecc/sm2.py b/python/federatedml/secureprotol/ecc/sm2.py index da749f9744..a4e5977d66 100644 --- a/python/federatedml/secureprotol/ecc/sm2.py +++ b/python/federatedml/secureprotol/ecc/sm2.py @@ -1,11 +1,5 @@ # -*-coding: Utf-8 -*- -''' -==================== -@File : sm2 .py -author: circle -Time:2023/4/3 -@Desc: -===================== + ''' from federatedml.util import LOGGER import tinyec.ec as ec @@ -76,30 +70,3 @@ def sha256_hash(plaintext): print(f'hash finished!\n,plaintext is:{plaintext},\n,hashvalue is:{hashvalue}') return hashvalue - -if __name__ == '__main__': - s1 = SM2() - print('s1 pr key:', s1.private_key, type(s1.private_key)) - plaintext1 = b'12345' - hashvalue1 = sha256_hash(plaintext1) - cipher1 = s1.encrypt(hashvalue1) - - s2 = SM2() - print('s2 pr key:', s2.private_key) - plaintext2 = b'12345' - hashvalue2 = sha256_hash(plaintext2) - cipher2 = s2.encrypt(hashvalue2) - # - print('cipher1:', cipher1, type(cipher1)) - - print('cipher2:', cipher2) - - sign1 = s1.diffie_hellman(cipher2) - sign2 = s2.diffie_hellman(cipher1) - # - print('sign1:', sign1, type(sign1)) - print('sign2:', sign2, type(sign2)) - if sign1 == sign2: - print('True') - else: - print('False') From 32e3f26e3c3fb1c23d34f9ac6f3e7ebda4584ca3 Mon Sep 17 00:00:00 2001 From: circlekkk <44366206+circlekkk@users.noreply.github.com> Date: Sat, 6 May 2023 11:06:57 +0800 Subject: [PATCH 6/9] Update sm2.py Signed-off-by: circlekkk <44366206+circlekkk@users.noreply.github.com> --- python/federatedml/secureprotol/ecc/sm2.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/python/federatedml/secureprotol/ecc/sm2.py b/python/federatedml/secureprotol/ecc/sm2.py index a4e5977d66..de89fcc1bb 100644 --- a/python/federatedml/secureprotol/ecc/sm2.py +++ b/python/federatedml/secureprotol/ecc/sm2.py @@ -63,10 +63,3 @@ def get_private_key(self): return self.private_key -def sha256_hash(plaintext): - from federatedml.secureprotol.hash.hash_factory import Hash - hash_operator = Hash('sha256', hex_output=False) - hashvalue = hash_operator.compute(plaintext, suffix_salt='12345') - print(f'hash finished!\n,plaintext is:{plaintext},\n,hashvalue is:{hashvalue}') - return hashvalue - From bcc007182461b2b323da2bdd019f5f1db4d503bd Mon Sep 17 00:00:00 2001 From: circlekkk Date: Tue, 19 Dec 2023 15:22:46 +0800 Subject: [PATCH 7/9] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E7=94=A8pipeline?= =?UTF-8?q?=E4=B8=8A=E4=BC=A0=E6=95=B0=E6=8D=AE=E5=92=8C=E8=B7=91=E7=BA=B5?= =?UTF-8?q?=E5=90=91xgb?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: circlekkk --- .../pipeline/pipline_hetero_secureboost.py | 82 +++++++++++++++++++ doc/tutorial/pipline_upload_csv.py | 30 +++++++ .../secureprotol/elliptic_curve_encryption.py | 10 +-- 3 files changed, 117 insertions(+), 5 deletions(-) create mode 100644 doc/tutorial/pipeline/pipline_hetero_secureboost.py create mode 100644 doc/tutorial/pipline_upload_csv.py diff --git a/doc/tutorial/pipeline/pipline_hetero_secureboost.py b/doc/tutorial/pipeline/pipline_hetero_secureboost.py new file mode 100644 index 0000000000..a77623b2e1 --- /dev/null +++ b/doc/tutorial/pipeline/pipline_hetero_secureboost.py @@ -0,0 +1,82 @@ +# 连接FATE flow +# !pipeline init --ip 127.0.0.1 --port 9380 +from pipeline.backend.pipeline import PipeLine +from pipeline.component import Reader, DataTransform, Intersection, HeteroSecureBoost, Evaluation +from pipeline.interface import Data +def train_pipeline(): + # 创建pipeline实例 + pipeline = PipeLine() \ + .set_initiator(role='guest', party_id=9999) \ + .set_roles(guest=9999, host=10000) + # load data + reader_0 = Reader(name="reader_0") + # set guest parameter + reader_0.get_party_instance(role='guest', party_id=9999).component_param( + table={"name": "breast_hetero_guest", "namespace": "experiment"}) + # set host parameter + reader_0.get_party_instance(role='host', party_id=10000).component_param( + table={"name": "breast_hetero_host", "namespace": "experiment"}) + # 添加DataTransform组件以将原始数据解析到数据实例中 + data_transform_0 = DataTransform(name="data_transform_0") + # set guest parameter + data_transform_0.get_party_instance(role='guest', party_id=9999).component_param( + with_label=True) + data_transform_0.get_party_instance(role='host', party_id=[10000]).component_param( + with_label=False) + # 添加求交组件以执行纵向场景的PSI + intersect_0 = Intersection(name="intersect_0") + # 定义HeteroSecureBoost组件。将为所有相关方设置以下参数。 + hetero_secureboost_0 = HeteroSecureBoost(name="hetero_secureboost_0", + num_trees=5, + bin_num=16, + task_type="classification", + objective_param={"objective": "cross_entropy"}, + encrypt_param={"method": "paillier"}, + tree_param={"max_depth": 3}) + # 模型评估组件 + evaluation_0 = Evaluation(name="evaluation_0", eval_type="binary") + # 将组件添加到管道,按执行顺序 + # -data_transform_0 用 reader_0的输出数据 + # -intersect_0 用 data_transform_0的输出数据 + # -heter_securebost_0用intersect_0的输出数据 + # -评估0用纵向securebost0对训练数据的预测结果 + pipeline.add_component(reader_0) + pipeline.add_component(data_transform_0, data=Data(data=reader_0.output.data)) + pipeline.add_component(intersect_0, data=Data(data=data_transform_0.output.data)) + pipeline.add_component(hetero_secureboost_0, data=Data(train_data=intersect_0.output.data)) + pipeline.add_component(evaluation_0, data=Data(data=hetero_secureboost_0.output.data)) + pipeline.compile() + # submit pipeline + pipeline.fit() + # 一旦完成了训练,就可以将训练后的模型用于预测。(可选)保存经过训练的管道以备将来使用。 + # 模型保存 + pipeline.dump("hetero_secureboost_pipeline_saved.pkl") +def predict_pipeline(): + # 模型预测 + # 首先,从训练管道部署所需组件 + pipeline = PipeLine.load_model_from_file('hetero_secureboost_pipeline_saved.pkl') + pipeline.deploy_component([pipeline.data_transform_0, pipeline.intersect_0, pipeline.hetero_secureboost_0]) + # 定义用于读取预测数据的新reader组件 + reader_1 = Reader(name="reader_1") + reader_1.get_party_instance(role="guest", party_id=9999).component_param(table={"name": "breast_hetero_guest", "namespace": "experiment"}) + reader_1.get_party_instance(role="host", party_id=10000).component_param(table={"name": "breast_hetero_host", "namespace": "experiment"}) + # (可选)定义新的评估组件。 + evaluation_0 = Evaluation(name="evaluation_0", eval_type="binary") + # 按执行顺序添加组件到预测管道 + predict_pipeline = PipeLine() + predict_pipeline.add_component(reader_1)\ + .add_component(pipeline, + data=Data(predict_input={pipeline.data_transform_0.input.data: reader_1.output.data}))\ + .add_component(evaluation_0, data=Data(data=pipeline.hetero_secureboost_0.output.data)) + # 执行预测 + predict_pipeline.predict() + +if __name__=='__main__': + train_pipeline() + + + + + + + diff --git a/doc/tutorial/pipline_upload_csv.py b/doc/tutorial/pipline_upload_csv.py new file mode 100644 index 0000000000..fe5a52c775 --- /dev/null +++ b/doc/tutorial/pipline_upload_csv.py @@ -0,0 +1,30 @@ +# 连接FATE flow +# !pipeline init --ip 127.0.0.1 --port 9380 +from pipeline.backend.pipeline import PipeLine +# 创建一个pipeline实例 +pipeline_upload = PipeLine().set_initiator(role='guest', party_id=9999).set_roles(guest=9999) +partition = 4 +# 定义将在FATE作业配置中使用的表名和命名空间 +dense_data_guest = {"name": "breast_hetero_guest", "namespace": f"experiment"} +dense_data_host = {"name": "breast_hetero_host", "namespace": f"experiment"} +tag_data = {"name": "breast_hetero_host", "namespace": f"experiment"} +# 添加要上传的数据 +import os + +data_base = "/root/PycharmProjects/FATE/" +pipeline_upload.add_upload_data(file=os.path.join(data_base, "examples/data/breast_hetero_guest.csv"), + table_name=dense_data_guest["name"], # table name + namespace=dense_data_guest["namespace"], # namespace + head=1, partition=partition) # data info + +pipeline_upload.add_upload_data(file=os.path.join(data_base, "examples/data/breast_hetero_host.csv"), + table_name=dense_data_host["name"], + namespace=dense_data_host["namespace"], + head=1, partition=partition) + +pipeline_upload.add_upload_data(file=os.path.join(data_base, "examples/data/breast_hetero_host.csv"), + table_name=tag_data["name"], + namespace=tag_data["namespace"], + head=1, partition=partition) +# 上传数据 +pipeline_upload.upload(drop=1) \ No newline at end of file diff --git a/python/federatedml/secureprotol/elliptic_curve_encryption.py b/python/federatedml/secureprotol/elliptic_curve_encryption.py index 937806f1b3..1bb097f537 100644 --- a/python/federatedml/secureprotol/elliptic_curve_encryption.py +++ b/python/federatedml/secureprotol/elliptic_curve_encryption.py @@ -19,7 +19,7 @@ from fate_crypto.psi import Curve25519 -from federatedml.secureprotol.ecc.sm2 import SM2 +# from federatedml.secureprotol.ecc.sm2 import SM2 class EllipticCurve(object): """ @@ -35,10 +35,10 @@ def __get_curve_instance(curve_name, curve_key): if curve_key is None: return Curve25519() return Curve25519(curve_key) - if curve_name == 'sm2': - if curve_key is None: - return SM2() - return SM2(curve_key) + # if curve_name == 'sm2': + # if curve_key is None: + # return SM2() + # return SM2(curve_key) def get_curve_key(self): return self.curve.get_private_key() From 4913124ef306645d5655b1af6fc12a2565f1185b Mon Sep 17 00:00:00 2001 From: circlekkk Date: Tue, 19 Dec 2023 16:15:00 +0800 Subject: [PATCH 8/9] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E6=97=A0=E7=8A=B6?= =?UTF-8?q?=E6=80=81=E7=BA=B5=E5=90=91xgb=E5=88=9D=E5=A7=8B=E4=BB=A3?= =?UTF-8?q?=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: circlekkk --- .../pipeline/pipline_hetero_secureboost.py | 3 +- .../steteless_hetero_secureboost/__init__.py | 11 + .../stateless_hetero_secureboost_guest.py | 504 ++++++++++++++++++ .../stateless_hetero_secureboost_host.py | 293 ++++++++++ python/federatedml/secureprotol/ecc/sm2.py | 1 + 5 files changed, 811 insertions(+), 1 deletion(-) create mode 100644 python/federatedml/ensemble/secureboost/steteless_hetero_secureboost/__init__.py create mode 100644 python/federatedml/ensemble/secureboost/steteless_hetero_secureboost/stateless_hetero_secureboost_guest.py create mode 100644 python/federatedml/ensemble/secureboost/steteless_hetero_secureboost/stateless_hetero_secureboost_host.py diff --git a/doc/tutorial/pipeline/pipline_hetero_secureboost.py b/doc/tutorial/pipeline/pipline_hetero_secureboost.py index a77623b2e1..10e55e09d1 100644 --- a/doc/tutorial/pipeline/pipline_hetero_secureboost.py +++ b/doc/tutorial/pipeline/pipline_hetero_secureboost.py @@ -72,7 +72,8 @@ def predict_pipeline(): predict_pipeline.predict() if __name__=='__main__': - train_pipeline() + # train_pipeline() + predict_pipeline() diff --git a/python/federatedml/ensemble/secureboost/steteless_hetero_secureboost/__init__.py b/python/federatedml/ensemble/secureboost/steteless_hetero_secureboost/__init__.py new file mode 100644 index 0000000000..fa29f4c09b --- /dev/null +++ b/python/federatedml/ensemble/secureboost/steteless_hetero_secureboost/__init__.py @@ -0,0 +1,11 @@ +# -*-coding: Utf-8 -*- +''' +==================== +@File : __init__.py .py +author: circle +Time:2023/12/19 +@Desc: +===================== +''' +if __name__ == '__main__': + print('Python') diff --git a/python/federatedml/ensemble/secureboost/steteless_hetero_secureboost/stateless_hetero_secureboost_guest.py b/python/federatedml/ensemble/secureboost/steteless_hetero_secureboost/stateless_hetero_secureboost_guest.py new file mode 100644 index 0000000000..485b798b33 --- /dev/null +++ b/python/federatedml/ensemble/secureboost/steteless_hetero_secureboost/stateless_hetero_secureboost_guest.py @@ -0,0 +1,504 @@ +import numpy as np +from operator import itemgetter +from federatedml.util import consts +from federatedml.util import LOGGER +from federatedml.ensemble.boosting import HeteroBoostingGuest +from federatedml.param.boosting_param import HeteroSecureBoostParam, DecisionTreeParam +from federatedml.util.io_check import assert_io_num_rows_equal +from federatedml.util.anonymous_generator_util import Anonymous +from federatedml.statistic.data_overview import with_weight, get_max_sample_weight +from federatedml.ensemble.basic_algorithms.decision_tree.tree_core.feature_importance import FeatureImportance +from federatedml.transfer_variable.transfer_class.hetero_secure_boosting_predict_transfer_variable import \ + HeteroSecureBoostTransferVariable +from federatedml.ensemble.basic_algorithms.decision_tree.tree_core import tree_plan as plan +from federatedml.protobuf.generated.boosting_tree_model_meta_pb2 import BoostingTreeModelMeta +from federatedml.protobuf.generated.boosting_tree_model_meta_pb2 import ObjectiveMeta +from federatedml.protobuf.generated.boosting_tree_model_meta_pb2 import QuantileMeta +from federatedml.protobuf.generated.boosting_tree_model_param_pb2 import BoostingTreeModelParam +from federatedml.protobuf.generated.boosting_tree_model_param_pb2 import FeatureImportanceInfo +from federatedml.ensemble.secureboost.secureboost_util.tree_model_io import load_hetero_tree_learner, \ + produce_hetero_tree_learner +from federatedml.ensemble.secureboost.secureboost_util.boosting_tree_predict import sbt_guest_predict, \ + mix_sbt_guest_predict, EINI_guest_predict +from federatedml.ensemble.secureboost.secureboost_util.subsample import goss_sampling + + +class HeteroSecureBoostingTreeGuest(HeteroBoostingGuest): + + def __init__(self): + super(HeteroSecureBoostingTreeGuest, self).__init__() + + self.tree_param = DecisionTreeParam() # decision tree param + self.use_missing = False + self.zero_as_missing = False + self.cur_epoch_idx = -1 + self.grad_and_hess = None + self.feature_importances_ = {} + self.model_param = HeteroSecureBoostParam() + self.complete_secure = False + self.data_alignment_map = {} + self.hetero_sbt_transfer_variable = HeteroSecureBoostTransferVariable() + self.model_name = 'HeteroSecureBoost' + self.max_sample_weight = 1 + self.max_sample_weight_computed = False + self.re_compute_goss_sample_weight = False + self.cipher_compressing = False + + self.enable_goss = False # GOSS + self.top_rate = None + self.other_rate = None + self.new_ver = True + + self.boosting_strategy = consts.STD_TREE # default work mode is std + + # fast sbt param + self.tree_num_per_party = 1 + self.guest_depth = 0 + self.host_depth = 0 + self.init_tree_plan = False + self.tree_plan = [] + + # multi-classification mode + self.multi_mode = consts.SINGLE_OUTPUT + + # EINI predict param + self.EINI_inference = False + self.EINI_random_mask = False + + def _init_model(self, param: HeteroSecureBoostParam): + + super(HeteroSecureBoostingTreeGuest, self)._init_model(param) + self.tree_param = param.tree_param + self.use_missing = param.use_missing + self.zero_as_missing = param.zero_as_missing + self.complete_secure = param.complete_secure + self.enable_goss = param.run_goss + self.top_rate = param.top_rate + self.other_rate = param.other_rate + self.cipher_compressing = param.cipher_compress + self.new_ver = param.new_ver + self.EINI_inference = param.EINI_inference + self.EINI_random_mask = param.EINI_random_mask + + # fast sbt param + self.tree_num_per_party = param.tree_num_per_party + self.boosting_strategy = param.boosting_strategy + self.guest_depth = param.guest_depth + self.host_depth = param.host_depth + + if self.use_missing: + self.tree_param.use_missing = self.use_missing + self.tree_param.zero_as_missing = self.zero_as_missing + + self.multi_mode = param.multi_mode + + def process_sample_weights(self, grad_and_hess, data_with_sample_weight=None): + + # add sample weights to gradient and hessian + if data_with_sample_weight is not None: + if with_weight(data_with_sample_weight): + LOGGER.info('weighted sample detected, multiply g/h by weights') + grad_and_hess = grad_and_hess.join(data_with_sample_weight, + lambda v1, v2: (v1[0] * v2.weight, v1[1] * v2.weight)) + if not self.max_sample_weight_computed: + self.max_sample_weight = get_max_sample_weight(data_with_sample_weight) + LOGGER.info('max sample weight is {}'.format(self.max_sample_weight)) + self.max_sample_weight_computed = True + + return grad_and_hess + + def get_tree_plan(self, idx): + + if not self.init_tree_plan: + tree_plan = plan.create_tree_plan(self.boosting_strategy, k=self.tree_num_per_party, + tree_num=self.boosting_round, + host_list=self.component_properties.host_party_idlist, + complete_secure=self.complete_secure) + self.tree_plan += tree_plan + self.init_tree_plan = True + + LOGGER.info('tree plan is {}'.format(self.tree_plan)) + return self.tree_plan[idx] + + def check_host_number(self, tree_type): + host_num = len(self.component_properties.host_party_idlist) + LOGGER.info('host number is {}'.format(host_num)) + if tree_type == plan.tree_type_dict['layered_tree']: + assert host_num == 1, 'only 1 host party is allowed in layered mode' + + def compute_grad_and_hess(self, y_hat, y, data_with_sample_weight=None): + + LOGGER.info("compute grad and hess") + loss_method = self.loss + if self.task_type == consts.CLASSIFICATION: + grad_and_hess = y.join(y_hat, lambda y, f_val: + (loss_method.compute_grad(y, loss_method.predict(f_val)), + loss_method.compute_hess(y, loss_method.predict(f_val)))) + else: + grad_and_hess = y.join(y_hat, lambda y, f_val: + (loss_method.compute_grad(y, f_val), + loss_method.compute_hess(y, f_val))) + + grad_and_hess = self.process_sample_weights(grad_and_hess, data_with_sample_weight) + + return grad_and_hess + + @staticmethod + def get_grad_and_hess(g_h, dim=0): + LOGGER.info("get grad and hess of tree {}".format(dim)) + grad_and_hess_subtree = g_h.mapValues( + lambda grad_and_hess: (grad_and_hess[0][dim], grad_and_hess[1][dim])) + return grad_and_hess_subtree + + def update_feature_importance(self, tree_feature_importance): + for fid in tree_feature_importance: + if fid not in self.feature_importances_: + self.feature_importances_[fid] = tree_feature_importance[fid] + else: + self.feature_importances_[fid] += tree_feature_importance[fid] + LOGGER.debug('cur feature importance {}'.format(self.feature_importances_)) + + def align_feature_importance_guest(self, suffix): + """ + receive feature importance from host to update global feature importance + """ + host_feature_importance_list = self.hetero_sbt_transfer_variable.host_feature_importance.get( + idx=-1, suffix=suffix) + # remove host importance, make sure host importance is latest when host anonymous features are updated + pop_key = [] + for key in self.feature_importances_: + sitename, fid = key + if consts.GUEST not in sitename: + pop_key.append(key) + + for k in pop_key: + self.feature_importances_.pop(k) + + for i in host_feature_importance_list: + self.feature_importances_.update(i) + + def goss_sample(self): + + sampled_gh = goss_sampling(self.grad_and_hess, self.top_rate, self.other_rate) + return sampled_gh + + def on_epoch_prepare(self, epoch_idx): + """ + + Parameters + ---------- + epoch_idx cur epoch idx + + Returns None + ------- + + Prepare g, h, sample weights, sampling at the beginning of every epoch + + """ + if self.cur_epoch_idx != epoch_idx: + self.grad_and_hess = self.compute_grad_and_hess(self.y_hat, self.y, self.data_inst) + self.cur_epoch_idx = epoch_idx + # goss sampling + if self.enable_goss: + if not self.re_compute_goss_sample_weight: + self.max_sample_weight = self.max_sample_weight * ((1 - self.top_rate) / self.other_rate) + self.grad_and_hess = self.goss_sample() + + def preprocess(self): + if self.multi_mode == consts.MULTI_OUTPUT: + # re-set dimension + self.booster_dim = 1 + + def postprocess(self): + pass + + def fit_a_learner(self, epoch_idx: int, booster_dim: int): + + self.on_epoch_prepare(epoch_idx) + + if self.multi_mode == consts.MULTI_OUTPUT: + g_h = self.grad_and_hess + else: + g_h = self.get_grad_and_hess(self.grad_and_hess, booster_dim) + + flow_id = self.generate_flowid(epoch_idx, booster_dim) + complete_secure = True if (epoch_idx == 0 and self.complete_secure) else False + + tree_type, target_host_id = None, None + fast_sbt = (self.boosting_strategy == consts.MIX_TREE or self.boosting_strategy == consts.LAYERED_TREE) + if fast_sbt: + tree_type, target_host_id = self.get_tree_plan(epoch_idx) + self.check_host_number(tree_type) + + tree = produce_hetero_tree_learner(role=self.role, tree_param=self.tree_param, flow_id=flow_id, + data_bin=self.data_bin, bin_split_points=self.bin_split_points, + bin_sparse_points=self.bin_sparse_points, task_type=self.task_type, + valid_features=self.sample_valid_features(), + host_party_list=self.component_properties.host_party_idlist, + runtime_idx=self.component_properties.local_partyid, + cipher_compress=self.cipher_compressing, + g_h=g_h, encrypter=self.encrypter, + goss_subsample=self.enable_goss, + objective=self.objective_param.objective, + complete_secure=complete_secure, max_sample_weights=self.max_sample_weight, + fast_sbt=fast_sbt, tree_type=tree_type, target_host_id=target_host_id, + guest_depth=self.guest_depth, host_depth=self.host_depth, + mo_tree=(self.multi_mode == consts.MULTI_OUTPUT), + class_num=len(self.classes_) if len(self.classes_) > 2 else 1 # mo parameter + ) + + tree.fit() + self.update_feature_importance(tree.get_feature_importance()) + self.align_feature_importance_guest(suffix=(epoch_idx, booster_dim)) + return tree + + def load_learner(self, model_meta, model_param, epoch_idx, booster_idx): + + flow_id = self.generate_flowid(epoch_idx, booster_idx) + runtime_idx = self.component_properties.local_partyid + host_list = self.component_properties.host_party_idlist + fast_sbt = (self.boosting_strategy == consts.MIX_TREE or self.boosting_strategy == consts.LAYERED_TREE) + tree_type, target_host_id = None, None + + if fast_sbt: + tree_type, target_host_id = self.get_tree_plan(epoch_idx) + + tree = load_hetero_tree_learner(role=self.role, tree_param=self.tree_param, model_meta=model_meta, + model_param=model_param, + flow_id=flow_id, runtime_idx=runtime_idx, host_party_list=host_list, + fast_sbt=fast_sbt, tree_type=tree_type, target_host_id=target_host_id) + + return tree + + def generate_summary(self) -> dict: + + summary = {'loss_history': self.history_loss, + 'best_iteration': self.callback_variables.best_iteration, + 'feature_importance': self.make_readable_feature_importance(self.feature_name_fid_mapping, + self.feature_importances_), + 'validation_metrics': self.callback_variables.validation_summary, + 'is_converged': self.is_converged} + + return summary + + @staticmethod + def make_readable_feature_importance(fid_mapping, feature_importances): + """ + replace feature id by real feature name + """ + new_fi = {} + for id_ in feature_importances: + if isinstance(id_, tuple): + if consts.GUEST in id_[0]: + new_fi[fid_mapping[id_[1]]] = feature_importances[id_].importance + else: + new_fi[id_[0] + '_' + str(id_[1])] = feature_importances[ + id_].importance + else: + new_fi[fid_mapping[id_]] = feature_importances[id_].importance + + return new_fi + + @assert_io_num_rows_equal + def predict(self, data_inst, ret_format='std'): + + # standard format, leaf indices, raw score + assert ret_format in ['std', 'leaf', 'raw'], 'illegal ret format' + + LOGGER.info('running prediction') + cache_dataset_key = self.predict_data_cache.get_data_key(data_inst) + + processed_data = self.data_and_header_alignment(data_inst) + + # sync feature importance if host anonymous change in model migration + if not self.on_training: + self.align_feature_importance_guest('predict') + + last_round = self.predict_data_cache.predict_data_last_round(cache_dataset_key) + + self.sync_predict_round(last_round) + + rounds = len(self.boosting_model_list) // self.booster_dim + trees = [] + LOGGER.debug('round involved in prediction {}, last round is {}, data key {}' + .format(list(range(last_round, rounds)), last_round, cache_dataset_key)) + + for idx in range(last_round, rounds): + for booster_idx in range(self.booster_dim): + tree = self.load_learner(self.booster_meta, + self.boosting_model_list[idx * self.booster_dim + booster_idx], + idx, booster_idx) + trees.append(tree) + + predict_cache = None + tree_num = len(trees) + + if last_round != 0: + predict_cache = self.predict_data_cache.predict_data_at(cache_dataset_key, min(rounds, last_round)) + LOGGER.info('load predict cache of round {}'.format(min(rounds, last_round))) + + if tree_num == 0 and predict_cache is not None and not (ret_format == 'leaf'): + return self.score_to_predict_result(data_inst, predict_cache) + + if self.boosting_strategy == consts.MIX_TREE: + predict_rs = mix_sbt_guest_predict( + processed_data, + self.hetero_sbt_transfer_variable, + trees, + self.learning_rate, + self.init_score, + self.booster_dim, + predict_cache, + pred_leaf=( + ret_format == 'leaf')) + else: + if self.EINI_inference and not self.on_training: # EINI is for inference stage + sitename = self.role + ':' + str(self.component_properties.local_partyid) + predict_rs = EINI_guest_predict( + processed_data, + trees, + self.learning_rate, + self.init_score, + self.booster_dim, + self.encrypt_param.key_length, + self.hetero_sbt_transfer_variable, + sitename, + self.component_properties.host_party_idlist, + predict_cache, + False) + else: + predict_rs = sbt_guest_predict( + processed_data, + self.hetero_sbt_transfer_variable, + trees, + self.learning_rate, + self.init_score, + self.booster_dim, + predict_cache, + pred_leaf=( + ret_format == 'leaf')) + + if ret_format == 'leaf': + return predict_rs # predict result is leaf position + + self.predict_data_cache.add_data(cache_dataset_key, predict_rs, cur_boosting_round=rounds) + LOGGER.debug('adding predict rs {}'.format(predict_rs)) + LOGGER.debug('last round is {}'.format(self.predict_data_cache.predict_data_last_round(cache_dataset_key))) + + if ret_format == 'raw': + return predict_rs + else: + return self.score_to_predict_result(data_inst, predict_rs) + + def load_feature_importance(self, feat_importance_param): + param = list(feat_importance_param) + rs_dict = {} + for fp in param: + if consts.GUEST in fp.sitename: + key = (fp.sitename.replace(':', '_'), fp.fid) # guest format + else: + sitename = fp.sitename.replace(':', '_') + anonymous_feat_name = Anonymous().get_suffix_from_anonymous_column(fp.fullname) + key = (sitename, anonymous_feat_name) + importance = FeatureImportance() + importance.from_protobuf(fp) + rs_dict[key] = importance + + self.feature_importances_ = rs_dict + + def get_model_meta(self): + model_meta = BoostingTreeModelMeta() + model_meta.tree_meta.CopyFrom(self.booster_meta) + model_meta.learning_rate = self.learning_rate + model_meta.num_trees = self.boosting_round + model_meta.quantile_meta.CopyFrom(QuantileMeta(bin_num=self.bin_num)) + model_meta.objective_meta.CopyFrom(ObjectiveMeta(objective=self.objective_param.objective, + param=self.objective_param.params)) + model_meta.use_missing = self.use_missing + model_meta.zero_as_missing = self.zero_as_missing + model_meta.task_type = self.task_type + model_meta.n_iter_no_change = self.n_iter_no_change + model_meta.tol = self.tol + model_meta.boosting_strategy = self.boosting_strategy + model_meta.module = "HeteroSecureBoost" + meta_name = consts.HETERO_SBT_GUEST_MODEL + "Meta" + + return meta_name, model_meta + + def get_model_param(self): + + model_param = BoostingTreeModelParam() + model_param.tree_num = len(self.boosting_model_list) + model_param.tree_dim = self.booster_dim + model_param.trees_.extend(self.boosting_model_list) + model_param.init_score.extend(self.init_score) + model_param.losses.extend(self.history_loss) + model_param.classes_.extend(map(str, self.classes_)) + model_param.num_classes = self.num_classes + if self.boosting_strategy == consts.STD_TREE: + model_param.model_name = consts.HETERO_SBT + elif self.boosting_strategy == consts.LAYERED_TREE: + model_param.model_name = consts.HETERO_FAST_SBT_LAYERED + elif self.boosting_strategy == consts.MIX_TREE: + model_param.model_name = consts.HETERO_FAST_SBT_MIX + model_param.best_iteration = self.callback_variables.best_iteration + + feature_importances = list(self.feature_importances_.items()) + feature_importances = sorted(feature_importances, key=itemgetter(1), reverse=True) + feature_importance_param = [] + + for (sitename, fid), importance in feature_importances: + if consts.GUEST in sitename: + fullname = self.feature_name_fid_mapping[fid] + else: + fullname = sitename + '_' + str(fid) + sitename = sitename.replace('_', ':') + fid = None + feature_importance_param.append(FeatureImportanceInfo(sitename=sitename, # sitename to distinguish sites + fid=fid, + importance=importance.importance, + fullname=fullname, + importance2=importance.importance_2, + main=importance.main_type + )) + model_param.feature_importances.extend(feature_importance_param) + model_param.feature_name_fid_mapping.update(self.feature_name_fid_mapping) + model_param.tree_plan.extend(plan.encode_plan(self.tree_plan)) + param_name = consts.HETERO_SBT_GUEST_MODEL + "Param" + + return param_name, model_param + + def set_model_meta(self, model_meta): + + if not self.is_warm_start: + # these hyper parameters are not needed in warm start setting + self.boosting_round = model_meta.num_trees + self.tol = model_meta.tol + self.n_iter_no_change = model_meta.n_iter_no_change + self.bin_num = model_meta.quantile_meta.bin_num + + self.learning_rate = model_meta.learning_rate + self.booster_meta = model_meta.tree_meta + self.objective_param.objective = model_meta.objective_meta.objective + self.objective_param.params = list(model_meta.objective_meta.param) + self.task_type = model_meta.task_type + self.boosting_strategy = model_meta.boosting_strategy + + def set_model_param(self, model_param): + + self.boosting_model_list = list(model_param.trees_) + self.init_score = np.array(list(model_param.init_score)) + self.history_loss = list(model_param.losses) + self.classes_ = list(map(int, model_param.classes_)) + self.booster_dim = model_param.tree_dim + self.num_classes = model_param.num_classes + self.feature_name_fid_mapping.update(model_param.feature_name_fid_mapping) + self.load_feature_importance(model_param.feature_importances) + # initialize loss function + self.loss = self.get_loss_function() + # init model tree plan if it exists + self.tree_plan = plan.decode_plan(model_param.tree_plan) + + +if __name__ == '__main__': + HeteroSecureBoostingTreeGuest() diff --git a/python/federatedml/ensemble/secureboost/steteless_hetero_secureboost/stateless_hetero_secureboost_host.py b/python/federatedml/ensemble/secureboost/steteless_hetero_secureboost/stateless_hetero_secureboost_host.py new file mode 100644 index 0000000000..f85117869f --- /dev/null +++ b/python/federatedml/ensemble/secureboost/steteless_hetero_secureboost/stateless_hetero_secureboost_host.py @@ -0,0 +1,293 @@ +from operator import itemgetter +from federatedml.util import LOGGER +from federatedml.util import consts +from federatedml.util.io_check import assert_io_num_rows_equal +from federatedml.protobuf.generated.boosting_tree_model_param_pb2 import FeatureImportanceInfo +from federatedml.ensemble.basic_algorithms.decision_tree.tree_core.feature_importance import FeatureImportance +from federatedml.ensemble.boosting import HeteroBoostingHost +from federatedml.param.boosting_param import HeteroSecureBoostParam, DecisionTreeParam +from federatedml.transfer_variable.transfer_class.hetero_secure_boosting_predict_transfer_variable import \ + HeteroSecureBoostTransferVariable +from federatedml.ensemble.secureboost.secureboost_util.tree_model_io import produce_hetero_tree_learner, \ + load_hetero_tree_learner +from federatedml.ensemble.secureboost.secureboost_util.boosting_tree_predict import sbt_host_predict, \ + mix_sbt_host_predict, EINI_host_predict +from federatedml.protobuf.generated.boosting_tree_model_meta_pb2 import BoostingTreeModelMeta +from federatedml.protobuf.generated.boosting_tree_model_meta_pb2 import QuantileMeta +from federatedml.protobuf.generated.boosting_tree_model_param_pb2 import BoostingTreeModelParam +from federatedml.ensemble.basic_algorithms.decision_tree.tree_core import tree_plan as plan +from federatedml.util.anonymous_generator_util import Anonymous + + +class HeteroSecureBoostingTreeHost(HeteroBoostingHost): + + def __init__(self): + super(HeteroSecureBoostingTreeHost, self).__init__() + self.use_missing = False + self.zero_as_missing = False + self.grad_and_hess = None + self.tree_param = DecisionTreeParam() # decision tree param + self.model_param = HeteroSecureBoostParam() + self.complete_secure = False + self.model_name = 'HeteroSecureBoost' + self.enable_goss = False + self.cipher_compressing = False + self.max_sample_weight = None + self.round_decimal = None + self.new_ver = True + self.feature_importance_aligned = False + + self.boosting_strategy = consts.STD_TREE + + # fast sbt param + self.tree_num_per_party = 1 + self.guest_depth = 0 + self.host_depth = 0 + self.init_tree_plan = False + self.tree_plan = [] + self.feature_importances_ = {} + + # EINI predict param + self.EINI_inference = False + self.EINI_random_mask = False + self.EINI_complexity_check = False + + self.multi_mode = consts.SINGLE_OUTPUT + + self.hetero_sbt_transfer_variable = HeteroSecureBoostTransferVariable() + + def _init_model(self, param: HeteroSecureBoostParam): + + super(HeteroSecureBoostingTreeHost, self)._init_model(param) + self.tree_param = param.tree_param + self.use_missing = param.use_missing + self.enable_goss = param.run_goss + self.zero_as_missing = param.zero_as_missing + self.complete_secure = param.complete_secure + self.sparse_opt_para = param.sparse_optimization + self.cipher_compressing = param.cipher_compress + self.new_ver = param.new_ver + + self.tree_num_per_party = param.tree_num_per_party + self.boosting_strategy = param.boosting_strategy + self.guest_depth = param.guest_depth + self.host_depth = param.host_depth + self.multi_mode = param.multi_mode + self.EINI_inference = param.EINI_inference + self.EINI_random_mask = param.EINI_random_mask + self.EINI_complexity_check = param.EINI_complexity_check + + if self.use_missing: + self.tree_param.use_missing = self.use_missing + self.tree_param.zero_as_missing = self.zero_as_missing + + def get_tree_plan(self, idx): + + if not self.init_tree_plan: + tree_plan = plan.create_tree_plan(self.boosting_strategy, k=self.tree_num_per_party, + tree_num=self.boosting_round, + host_list=self.component_properties.host_party_idlist, + complete_secure=self.complete_secure) + self.tree_plan += tree_plan + self.init_tree_plan = True + + LOGGER.info('tree plan is {}'.format(self.tree_plan)) + return self.tree_plan[idx] + + def update_feature_importance(self, tree_feature_importance): + for fid in tree_feature_importance: + if fid not in self.feature_importances_: + self.feature_importances_[fid] = tree_feature_importance[fid] + else: + self.feature_importances_[fid] += tree_feature_importance[fid] + + def load_feature_importance(self, feat_importance_param): + param = list(feat_importance_param) + rs_dict = {} + for fp in param: + key = fp.fid + importance = FeatureImportance() + importance.from_protobuf(fp) + rs_dict[key] = importance + + self.feature_importances_ = rs_dict + + def get_anonymous_importance(self): + new_feat_importance = {} + for key in self.feature_importances_: + anonymous_name = self.anonymous_header[self.feature_name_fid_mapping[key]] + party = Anonymous.get_role_from_anonymous_column(anonymous_name) + party_id = Anonymous.get_party_id_from_anonymous_column(anonymous_name) + anonymous_feat = Anonymous.get_suffix_from_anonymous_column(anonymous_name) + new_feat_importance[(party + '_' + party_id, anonymous_feat)] = self.feature_importances_[key] + return new_feat_importance + + def align_feature_importance_host(self, suffix): + """ + send feature importance to guest to update global feature importance + """ + new_feat_importance = self.get_anonymous_importance() + self.hetero_sbt_transfer_variable.host_feature_importance.remote(new_feat_importance, suffix=suffix) + + def preprocess(self): + if self.multi_mode == consts.MULTI_OUTPUT: + self.booster_dim = 1 + + def postprocess(self): + pass + + def fit_a_learner(self, epoch_idx: int, booster_dim: int): + + flow_id = self.generate_flowid(epoch_idx, booster_dim) + complete_secure = True if (epoch_idx == 0 and self.complete_secure) else False + fast_sbt = (self.boosting_strategy == consts.MIX_TREE or self.boosting_strategy == consts.LAYERED_TREE) + + tree_type, target_host_id = None, None + if fast_sbt: + tree_type, target_host_id = self.get_tree_plan(epoch_idx) + + tree = produce_hetero_tree_learner(role=self.role, tree_param=self.tree_param, flow_id=flow_id, + data_bin=self.data_bin, bin_split_points=self.bin_split_points, + bin_sparse_points=self.bin_sparse_points, task_type=self.task_type, + valid_features=self.sample_valid_features(), + host_party_list=self.component_properties.host_party_idlist, + runtime_idx=self.component_properties.local_partyid, + cipher_compress=self.cipher_compressing, + complete_secure=complete_secure, + fast_sbt=fast_sbt, tree_type=tree_type, target_host_id=target_host_id, + guest_depth=self.guest_depth, host_depth=self.host_depth, + mo_tree=(self.multi_mode == consts.MULTI_OUTPUT), bin_num=self.bin_num + ) + tree.fit() + self.update_feature_importance(tree.get_feature_importance()) + self.align_feature_importance_host(suffix=(epoch_idx, booster_dim)) + return tree + + def load_learner(self, model_meta, model_param, epoch_idx, booster_idx): + + flow_id = self.generate_flowid(epoch_idx, booster_idx) + runtime_idx = self.component_properties.local_partyid + fast_sbt = (self.boosting_strategy == consts.MIX_TREE or self.boosting_strategy == consts.LAYERED_TREE) + tree_type, target_host_id = None, None + + if fast_sbt: + tree_type, target_host_id = self.get_tree_plan(epoch_idx) + + tree = load_hetero_tree_learner(self.role, self.tree_param, model_meta, model_param, flow_id, + runtime_idx, + fast_sbt=fast_sbt, tree_type=tree_type, target_host_id=target_host_id) + + return tree + + def generate_summary(self) -> dict: + + summary = {'best_iteration': self.callback_variables.best_iteration, 'is_converged': self.is_converged} + LOGGER.debug('summary is {}'.format(summary)) + + return summary + + @assert_io_num_rows_equal + def predict(self, data_inst): + + LOGGER.info('running prediction') + + processed_data = self.data_and_header_alignment(data_inst) + + self.set_anonymous_header(processed_data) + + # sync feature importance if host anonymous change in model migration + if not self.on_training: + self.align_feature_importance_host('predict') + + predict_start_round = self.sync_predict_start_round() + + rounds = len(self.boosting_model_list) // self.booster_dim + trees = [] + for idx in range(predict_start_round, rounds): + for booster_idx in range(self.booster_dim): + tree = self.load_learner(self.booster_meta, + self.boosting_model_list[idx * self.booster_dim + booster_idx], + idx, booster_idx) + trees.append(tree) + + if len(trees) == 0: + LOGGER.info('no tree for predicting, prediction done') + return + + if self.boosting_strategy == consts.MIX_TREE: + mix_sbt_host_predict(processed_data, self.hetero_sbt_transfer_variable, trees) + else: + if self.EINI_inference and not self.on_training: + sitename = self.role + ':' + str(self.component_properties.local_partyid) + EINI_host_predict(processed_data, trees, sitename, self.component_properties.local_partyid, + self.component_properties.host_party_idlist, self.booster_dim, + self.hetero_sbt_transfer_variable, self.EINI_complexity_check, self.EINI_random_mask,) + else: + sbt_host_predict(processed_data, self.hetero_sbt_transfer_variable, trees) + + def get_model_meta(self): + model_meta = BoostingTreeModelMeta() + model_meta.tree_meta.CopyFrom(self.booster_meta) + model_meta.num_trees = self.boosting_round + model_meta.quantile_meta.CopyFrom(QuantileMeta(bin_num=self.bin_num)) + model_meta.boosting_strategy = self.boosting_strategy + model_meta.module = "HeteroSecureBoost" + meta_name = "HeteroSecureBoostingTreeHostMeta" + return meta_name, model_meta + + def get_model_param(self): + + model_param = BoostingTreeModelParam() + model_param.tree_num = len(self.boosting_model_list) + model_param.tree_dim = self.booster_dim + model_param.trees_.extend(self.boosting_model_list) + + anonymous_name_mapping = {} + + for fid, name in self.feature_name_fid_mapping.items(): + anonymous_name_mapping[self.anonymous_header[name]] = name + + model_param.anonymous_name_mapping.update(anonymous_name_mapping) + model_param.feature_name_fid_mapping.update(self.feature_name_fid_mapping) + if self.boosting_strategy == consts.STD_TREE: + model_param.model_name = consts.HETERO_SBT + elif self.boosting_strategy == consts.LAYERED_TREE: + model_param.model_name = consts.HETERO_FAST_SBT_LAYERED + elif self.boosting_strategy == consts.MIX_TREE: + model_param.model_name = consts.HETERO_FAST_SBT_MIX + model_param.best_iteration = self.callback_variables.best_iteration + model_param.tree_plan.extend(plan.encode_plan(self.tree_plan)) + + feature_importances = list(self.feature_importances_.items()) + feature_importances = sorted(feature_importances, key=itemgetter(1), reverse=True) + feature_importance_param = [] + for fid, importance in feature_importances: + feature_importance_param.append(FeatureImportanceInfo(sitename=consts.HOST_LOCAL, + fid=fid, + importance=importance.importance, + fullname=self.feature_name_fid_mapping[fid], + main=importance.main_type + )) + model_param.feature_importances.extend(feature_importance_param) + + param_name = "HeteroSecureBoostingTreeHostParam" + + return param_name, model_param + + def set_model_meta(self, model_meta): + if not self.is_warm_start: + self.boosting_round = model_meta.num_trees + self.booster_meta = model_meta.tree_meta + self.bin_num = model_meta.quantile_meta.bin_num + self.boosting_strategy = model_meta.boosting_strategy + + def set_model_param(self, model_param): + self.boosting_model_list = list(model_param.trees_) + self.booster_dim = model_param.tree_dim + self.feature_name_fid_mapping.update(model_param.feature_name_fid_mapping) + self.tree_plan = plan.decode_plan(model_param.tree_plan) + self.load_feature_importance(model_param.feature_importances) + + # implement abstract function + def check_label(self, *args): + pass diff --git a/python/federatedml/secureprotol/ecc/sm2.py b/python/federatedml/secureprotol/ecc/sm2.py index de89fcc1bb..e55cde6653 100644 --- a/python/federatedml/secureprotol/ecc/sm2.py +++ b/python/federatedml/secureprotol/ecc/sm2.py @@ -61,5 +61,6 @@ def diffie_hellman(self, ciphertext): def get_private_key(self): return self.private_key +''' From c617942ad386e10e2b4561a19113a88ca4c477e9 Mon Sep 17 00:00:00 2001 From: circlekkk Date: Tue, 19 Dec 2023 17:05:15 +0800 Subject: [PATCH 9/9] =?UTF-8?q?=E6=97=A0=E7=8A=B6=E6=80=81=E7=BA=B5?= =?UTF-8?q?=E5=90=91xgbguest=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: circlekkk --- .../stateless_hetero_secureboost_guest.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/python/federatedml/ensemble/secureboost/steteless_hetero_secureboost/stateless_hetero_secureboost_guest.py b/python/federatedml/ensemble/secureboost/steteless_hetero_secureboost/stateless_hetero_secureboost_guest.py index 485b798b33..757782f9c8 100644 --- a/python/federatedml/ensemble/secureboost/steteless_hetero_secureboost/stateless_hetero_secureboost_guest.py +++ b/python/federatedml/ensemble/secureboost/steteless_hetero_secureboost/stateless_hetero_secureboost_guest.py @@ -71,8 +71,8 @@ def _init_model(self, param: HeteroSecureBoostParam): self.tree_param = param.tree_param self.use_missing = param.use_missing self.zero_as_missing = param.zero_as_missing - self.complete_secure = param.complete_secure - self.enable_goss = param.run_goss + self.complete_secure = param.complete_secure#?False + self.enable_goss = param.run_goss#高斯采样?False self.top_rate = param.top_rate self.other_rate = param.other_rate self.cipher_compressing = param.cipher_compress @@ -93,7 +93,12 @@ def _init_model(self, param: HeteroSecureBoostParam): self.multi_mode = param.multi_mode def process_sample_weights(self, grad_and_hess, data_with_sample_weight=None): - + """ + 这个方法的作用是处理样本权重(sample weights)。样本权重是机器学习中一个常用的概念,它表示每个样本对模型更新的影响程度。 + @param grad_and_hess: + @param data_with_sample_weight: + @return: + """ # add sample weights to gradient and hessian if data_with_sample_weight is not None: if with_weight(data_with_sample_weight):