2016-10-03 3 views
0

Jaro-Winkler를 사용하여 두 문자열 간의 유사성을 계산하는 oracle pl/sql에 다음 코드가 있습니다. 우리가하려고하는 것은 두 문자열의 유사성을 기반으로하는 dup을 찾는 것입니다. 우리의 사용자 사례는 사용자가 사용자 정보를 입력 할 수 있다는 것입니다. first_name, last_name하지만 오타가 있으며 사용할 수있는 유일한 키는 first_name, last_name입니다. 사용자 식별을위한 SSN 또는 이메일과 같은 고유 식별자의 다른 부분은 없습니다. 우리의 생각은 first_name, last_name에 자체 조인을 한 다음 유사점을 얻고 그 결과에 따라 우리가 dup을 식별 할 수 있습니다.google-bigquery 또는 사용자 정의 스코어 기능이있는 amazon-redshift

그러나 사용자가 10,000 명이라도 경기를 수행하는 데 100,000,000 개의 작업이 있으며 Oracle 데이터베이스에서이 작업을 시도하며 너무 느립니다.

Google은 bigquery 또는 Amazon Red-shift를 처음 사용합니다. 데이터 세트에서 맞춤 함수를 구현하는 방법에 대한 자습서가 있습니까?

google-bigquery 또는 Amazon Red Shift에는 이미 oracle의 솔루션과 비슷한 솔루션이 있습니까?

우리의 현재 환경은이 개념 증명 을 수행하는 것이 가능하지 않으므로 구름에서이 연습을하고 싶습니다.

모든 도움에 감사드립니다. 예를
그것은 표준 SQL과 BigQuery를위한 아래

--http://www.orafaq.com/forum/t/164224/ 
CREATE OR REPLACE FUNCTION GKN_COMMON.jws -- Jaro-Winkler similarity 
    (p_string1  IN VARCHAR2, 
    p_string2  IN VARCHAR2) 
    RETURN   NUMBER 
    DETERMINISTIC 
AS 
    v_string1   VARCHAR2 (32767); 
    v_string2   VARCHAR2 (32767); 
    v_closeness  NUMBER := 0; 
    v_temp   VARCHAR2 (32767); 
    v_comp1   VARCHAR2 (32767); 
    v_comp2   VARCHAR2 (32767); 
    v_matches   NUMBER := 0; 
    v_char   VARCHAR2 (1); 
    v_transpositions NUMBER := 0; 
    v_d_jaro   NUMBER := 0; 
    v_leading   NUMBER := 0; 
    v_d_winkler  NUMBER := 0; 
    v_jws    NUMBER := 0; 
BEGIN 
    -- check for null strings: 
    IF p_string1 IS NULL OR p_string2 IS NULL THEN 
    RETURN 0; 
    END IF; 
    -- remove accents: 
    v_string1 := translate (p_string1, 
      '?S?Zs?z?AAA?A??CEEEEIIII??OOO?O?UUUUY?aaa?a??ceeeeiiii???ooo?ouuuuyy??', 
      'fSEZsezYAAAAAAECEEEEIIIIDNOOOOOOUUUUYBaaaaaaeceeeeiiiioonooooouuuuyy'); 
    v_string2 := translate (p_string2, 
      '?S?Zs?z?AAA?A??CEEEEIIII??OOO?O?UUUUY?aaa?a??ceeeeiiii???ooo?ouuuuyy??', 
      'fSEZsezYAAAAAAECEEEEIIIIDNOOOOOOUUUUYBaaaaaaeceeeeiiiioonooooouuuuyy'); 
    -- closeness: 
    v_closeness := (GREATEST (LENGTH (v_string1), LENGTH (v_string2))/2) - 1; 
    -- find matching characters and transpositions within closeness: 
    v_temp := v_string2; 
    FOR i IN 1 .. LENGTH (v_string1) LOOP 
    IF INSTR (v_temp, SUBSTR (v_string1, i, 1)) > 0 THEN 
     v_char := SUBSTR (v_string1, i, 1); 
     IF ABS (INSTR (v_string1, v_char) - INSTR (v_string2, v_char)) <= v_closeness THEN 
     v_comp1 := v_comp1 || SUBSTR (v_string1, i, 1); 
     v_temp := SUBSTR (v_temp, 1, INSTR (v_temp, SUBSTR (v_string1, i, 1)) - 1) 
       || SUBSTR (v_temp, INSTR (v_temp, SUBSTR (v_string1, i, 1)) + 1); 
     END IF; 
    END IF;  
    END LOOP; 
    v_temp := v_string1; 
    FOR i IN 1 .. LENGTH (v_string2) LOOP 
    IF INSTR (v_temp, SUBSTR (v_string2, i, 1)) > 0 THEN 
     v_char := SUBSTR (v_string2, i, 1); 
     IF ABS (INSTR (v_string2, v_char) - INSTR (v_string1, v_char)) <= v_closeness THEN 
     v_comp2 := v_comp2 || SUBSTR (v_string2, i, 1); 
     v_temp := SUBSTR (v_temp, 1, INSTR (v_temp, SUBSTR (v_string2, i, 1)) - 1) 
       || SUBSTR (v_temp, INSTR (v_temp, SUBSTR (v_string2, i, 1)) + 1); 
     END IF; 
    END IF;  
    END LOOP; 
    -- check for null strings: 
    IF v_comp1 IS NULL OR v_comp2 IS NULL THEN 
    RETURN 0; 
    END IF; 
    -- count matches and transpositions within closeness: 
    FOR i IN 1 .. LEAST (LENGTH (v_comp1), LENGTH (v_comp2)) LOOP 
    IF SUBSTR (v_comp1, i, 1) = SUBSTR (v_comp2, i, 1) THEN 
     v_matches := v_matches + 1; 
    ELSE 
     v_char := SUBSTR (v_comp1, i, 1); 
     IF ABS (INSTR (v_string1, v_char) - INSTR (v_string2, v_char)) <= v_closeness THEN 
     v_transpositions := v_transpositions + 1; 
     v_matches := v_matches + 1; 
     END IF; 
    END IF; 
    END LOOP; 
    v_transpositions := v_transpositions/2; 
    -- check for no matches: 
    IF v_matches = 0 
    THEN RETURN 0; 
    END IF; 
    -- Jaro: 
    v_d_jaro := ((v_matches/LENGTH (v_string1)) + 
       (v_matches/LENGTH (v_string2)) + 
       ((v_matches - v_transpositions)/v_matches)) 
      /3; 
    -- count matching leading characters (up to 4): 
    FOR i IN 1 .. LEAST (LENGTH (v_string1), LENGTH (v_string2), 4) LOOP 
    IF SUBSTR (v_string1, i, 1) = SUBSTR (v_string2, i, 1) THEN 
     v_leading := v_leading + 1; 
    ELSE 
     EXIT; 
    END IF; 
    END LOOP; 
    -- Winkler: 
    v_d_winkler := v_d_jaro + ((v_leading * .1) * (1 - v_d_jaro)); 
    -- Jaro-Winkler similarity rounded: 
    v_jws := ROUND (v_d_winkler * 100); 
    RETURN v_jws; 
END jws; 





WITH 
    strings AS 
     (SELECT NULL   string1, NULL  string2 FROM DUAL UNION ALL 
     SELECT 'test'  string1, NULL  string2 FROM DUAL UNION ALL 
     SELECT NULL   string1, 'test'  string2 FROM DUAL UNION ALL 
     SELECT 'CRATE'  string1, 'TRACE'  string2 FROM DUAL UNION ALL 
     SELECT 'MARTHA'  string1, 'MARHTA'  string2 FROM DUAL UNION ALL 
     SELECT 'DWAYNE'  string1, 'DUANE'  string2 FROM DUAL UNION ALL 
     SELECT 'DIXON'  string1, 'DICKSONX' string2 FROM DUAL UNION ALL 
     SELECT 'Dunningham' string1, 'Cunningham' string2 FROM DUAL UNION ALL 
     SELECT 'Abroms'  string1, 'Abrams'  string2 FROM DUAL UNION ALL 
     SELECT 'Lampley' string1, 'Campley' string2 FROM DUAL UNION ALL 
     SELECT 'Jonathon' string1, 'Jonathan' string2 FROM DUAL UNION ALL 
     SELECT 'Jeraldine' string1, 'Gerladine' string2 FROM DUAL UNION ALL 
     SELECT 'test'  string1, 'blank'  string2 FROM DUAL UNION ALL 
     SELECT 'everybody' string1, 'every'  string2 FROM DUAL UNION ALL 
     SELECT 'a'   string1, 'aaa'  string2 FROM DUAL UNION ALL 
     SELECT 'Géraldine' string1, 'Gerladine' string2 FROM DUAL UNION ALL 
     SELECT 'Jérôme'  string1, 'Jerome'  string2 FROM DUAL UNION ALL 
     SELECT 'ça'   string1, 'ca'  string2 FROM DUAL UNION ALL 
     SELECT 'Üwe'   string1, 'Uwe'  string2 FROM DUAL) 
SELECT string1, string2, 
     --UTL_MATCH.JARO_WINKLER_SIMILARITY (string1, string2) oracle_jws, 
     jws (string1, string2) my_jws 
FROM strings 
ORDER BY my_jws DESC 
+0

를 사용합니다.,. – Mookayama

+0

https://aws.amazon.com/blogs/aws/user-defined-functions-for-amazon-redshift/ – Mookayama

+0

BigQuery의 UDF 문서 : https://cloud.google.com/bigquery/sql-reference/user 정의 함수 –

답변

3

점검 (Enabling Standard SQL 확인) 우리는 구글-BigQuery에 자습서를 검색하지만 아무것도 찾을 해달라고 JS User-Defined Functions

CREATE TEMPORARY FUNCTION similariry(Name1 STRING, Name2 STRING) 
    RETURNS FLOAT64 
    LANGUAGE js AS """ 
    var _extend = function(dst) { 
     var sources = Array.prototype.slice.call(arguments, 1); 
     for (var i=0; i<sources.length; ++i) { 
     var src = sources[i]; 
     for (var p in src) { 
      if (src.hasOwnProperty(p)) dst[p] = src[p]; 
     } 
     } 
     return dst; 
    }; 

    var Levenshtein = { 
     /** 
     * Calculate levenshtein distance of the two strings. 
     * 
     * @param str1 String the first string. 
     * @param str2 String the second string. 
     * @return Integer the levenshtein distance (0 and above). 
     */ 
     get: function(str1, str2) { 
     // base cases 
     if (str1 === str2) return 0; 
     if (str1.length === 0) return str2.length; 
     if (str2.length === 0) return str1.length; 

     // two rows 
     var prevRow = new Array(str2.length + 1), 
      curCol, nextCol, i, j, tmp; 

     // initialise previous row 
     for (i=0; i<prevRow.length; ++i) { 
      prevRow[i] = i; 
     } 

     // calculate current row distance from previous row 
     for (i=0; i<str1.length; ++i) { 
      nextCol = i + 1; 

      for (j=0; j<str2.length; ++j) { 
      curCol = nextCol; 

      // substution 
      nextCol = prevRow[j] + ((str1.charAt(i) === str2.charAt(j)) ? 0 : 1); 
      // insertion 
      tmp = curCol + 1; 
      if (nextCol > tmp) { 
       nextCol = tmp; 
      } 
      // deletion 
      tmp = prevRow[j + 1] + 1; 
      if (nextCol > tmp) { 
       nextCol = tmp; 
      } 

      // copy current col value into previous (in preparation for next iteration) 
      prevRow[j] = curCol; 
      } 

      // copy last col value into previous (in preparation for next iteration) 
      prevRow[j] = nextCol; 
     } 

     return nextCol; 
     } 

    }; 

    var the_Name1; 

    try { 
     the_Name1 = decodeURI(Name1).toLowerCase(); 
    } catch (ex) { 
     the_Name1 = Name1.toLowerCase(); 
    } 

    try { 
     the_Name2 = decodeURI(Name2).toLowerCase(); 
    } catch (ex) { 
     the_Name2 = Name2.toLowerCase(); 
    } 

    return 1 - Levenshtein.get(the_Name1, the_Name2)/the_Name1.length; 

    """; 

    WITH strings AS (
    SELECT NULL   string1, NULL  string2 UNION ALL 
    SELECT 'test'  string1, NULL  string2 UNION ALL 
    SELECT NULL   string1, 'test'  string2 UNION ALL 
    SELECT 'CRATE'  string1, 'TRACE'  string2 UNION ALL 
    SELECT 'MARTHA'  string1, 'MARHTA'  string2 UNION ALL 
    SELECT 'DWAYNE'  string1, 'DUANE'  string2 UNION ALL 
    SELECT 'DIXON'  string1, 'DICKSONX' string2 UNION ALL 
    SELECT 'Dunningham' string1, 'Cunningham' string2 UNION ALL 
    SELECT 'Abroms'  string1, 'Abrams'  string2 UNION ALL 
    SELECT 'Lampley' string1, 'Campley' string2 UNION ALL 
    SELECT 'Jonathon' string1, 'Jonathan' string2 UNION ALL 
    SELECT 'Jeraldine' string1, 'Gerladine' string2 UNION ALL 
    SELECT 'test'  string1, 'blank'  string2 UNION ALL 
    SELECT 'everybody' string1, 'every'  string2 UNION ALL 
    SELECT 'a'   string1, 'aaa'  string2 UNION ALL 
    SELECT 'Géraldine' string1, 'Gerladine' string2 UNION ALL 
    SELECT 'Jérôme'  string1, 'Jerome'  string2 UNION ALL 
    SELECT 'ça'   string1, 'ca'  string2 UNION ALL 
    SELECT 'Üwe'   string1, 'Uwe'  string2 
) 
    SELECT string1, string2, similariry(string1, string2) my_sim 
    FROM strings 
    ORDER BY my_sim DESC 
관련 문제