Tuesday, February 18, 2014

Fixing duplicated persons in TLS207

Due to a bug in original data in DOCDB, a percentage of person ids appear twice for the same application.

I pubblish here a code for MySql but easily adaptable to other platforms, for fixing the issue.


The following code creates starting from a list of application ids (XX_appln_id usually I have all EPO applications in it); it must be broken into 3 parts cause in case of duplicates I take the minimum seq_nr (both for applicant and inventor) but in order to avoid cases where we have the same person id as inventor and applicant (so where the min would be 0) in a support table and for the final table I take the max seq_nr.



use newpatstat;

drop table if exists TLS207_pers_appln;
drop table if exists TLS207_pers_appln0;


create table TLS207_pers_appln0
SELECT  e.person_id, e.APPLN_ID, min(e.applt_seq_nr) as applt_seq_nr, invt_seq_nr 
from  patstat.TLS207_pers_appln e
inner join XX_appln_id t on t.appln_id = e .appln_id
where e.invt_seq_nr =0
group by e.person_id, e.APPLN_ID, e.invt_seq_nr ;


insert into TLS207_pers_appln0
SELECT  e.person_id, e.APPLN_ID, e.applt_seq_nr, min(e.invt_seq_nr ) as invt_seq_nr
from  patstat.TLS207_pers_appln e
inner join XX_appln_id t on t.appln_id = e .appln_id
where e.applt_seq_nr =0
group by e.person_id, e.APPLN_ID, e.applt_seq_nr ;

insert into TLS207_pers_appln0
SELECT  e.person_id, e.APPLN_ID, min(e.applt_seq_nr) as applt_seq_nr, min(e.invt_seq_nr) as invt_seq_nr 
from  patstat.TLS207_pers_appln e
inner join XX_appln_id t on t.appln_id = e .appln_id
group by e.person_id, e.APPLN_ID;


create table TLS207_pers_appln
SELECT  e.person_id, e.APPLN_ID, max(e.applt_seq_nr) as applt_seq_nr, max(e.invt_seq_nr ) as invt_seq_nr 
from  TLS207_pers_appln0 e
group by e.person_id, e.APPLN_ID;


CREATE INDEX IDX207_PERSON ON TLS207_PERS_APPLN(PERSON_ID);
CREATE INDEX IDX207_APPLN ON TLS207_PERS_APPLN(APPLN_ID);


drop table if exists tls207_chk01;
drop table if exists tls207_chk02;



-- two check tables should be empty

create table tls207_chk01
SELECT t.APPLN_ID, t.person_id, invt_seq_nr, Count(t.APPLN_ID) AS cc
FROM tls207_pers_appln t
GROUP BY t.APPLN_ID, t.person_id, invt_seq_nr
HAVING cc >  '1';

create table tls207_chk02
SELECT t.APPLN_ID, t.person_id, applt_seq_nr, Count(t.APPLN_ID) AS cc
FROM tls207_pers_appln t
GROUP BY t.APPLN_ID, t.person_id , applt_seq_nr
HAVING cc >  '1';



drop table if exists TLS207_pers_appln0;

No comments:

Post a Comment