sql >> Base de Datos >  >> RDS >> Sqlserver

Un problema de consulta SQL muy complicado

Cambié un poco su modelo de datos para intentar que sea un poco más obvio lo que está pasando.

CREATE TABLE [dbo].[Customer]
(
    [CustomerName]      VARCHAR(20)     NOT NULL,
    [CustomerLink]      VARBINARY(20)   NULL
)

CREATE TABLE [dbo].[CustomerIdentification]
(
    [CustomerName]      VARCHAR(20)     NOT NULL,
    [ID]                VARCHAR(50)     NOT NULL,
    [IDType]            VARCHAR(16)     NOT NULL
)

Y he añadido algunos datos de prueba más..

INSERT  [dbo].[Customer]
        ([CustomerName])
VALUES  ('Fred'),
        ('Bob'),
        ('Vince'),
        ('Tom'),
        ('Alice'),
        ('Matt'),
        ('Dan')

INSERT  [dbo].[CustomerIdentification]
VALUES  
        ('Fred',    'A',    'Passport'),
        ('Fred',    'A',    'SIN'),
        ('Fred',    'A',    'Drivers Licence'),
        ('Bob',     'A',    'Passport'),
        ('Bob',     'B',    'Drivers Licence'),
        ('Bob',     'C',    'Credit Card'),
        ('Vince',   'A',    'Passport'),
        ('Vince',   'B',    'SIN'),
        ('Vince',   'C',    'Credit Card'),
        ('Tom',     'A',    'Passport'),
        ('Tom',     'B',    'SIN'),
        ('Tom',     'B',    'Drivers Licence'),
        ('Alice',   'B',    'Drivers Licence'),
        ('Matt',    'X',    'Drivers Licence'),
        ('Dan',     'X',    'Drivers Licence')

¿Es esto lo que estás buscando?:

;WITH [cteNonMatchingIDs] AS (
    -- Pairs where the IDType is the same, but 
    -- name and ID don't match
    SELECT  ci3.[CustomerName] AS [CustomerName1],
            ci4.[CustomerName] AS [CustomerName2]
    FROM [dbo].[CustomerIdentification] ci3
    INNER JOIN [dbo].[CustomerIdentification] ci4
        ON ci3.[IDType] = ci4.[IDType]
    WHERE ci3.[CustomerName] <> ci4.[CustomerName]
    AND ci3.[ID] <> ci4.[ID]
),
[cteMatchedPairs] AS (
    -- Pairs where the IDType and ID match, and
    -- there aren't any non matching IDs for the
    -- CustomerName
    SELECT DISTINCT 
            ci1.[CustomerName] AS [CustomerName1],
            ci2.[CustomerName] AS [CustomerName2]
    FROM [dbo].[CustomerIdentification] ci1
    LEFT JOIN [dbo].[CustomerIdentification] ci2
        ON ci1.[CustomerName] <> ci2.[CustomerName]
        AND ci1.[IDType] = ci2.[IDType] 
    WHERE ci1.[ID] = ISNULL(ci2.[ID], ci1.[ID])
    AND NOT EXISTS (
        SELECT 1
        FROM [cteNonMatchingIDs]
        WHERE ci1.[CustomerName] = [CustomerName1] -- correlated subquery
        AND ci2.[CustomerName] = [CustomerName2]
    )
    AND ci1.[CustomerName] < ci2.[CustomerName]
),
[cteMatchedList] ([CustomerName], [CustomerNameList]) AS (
    -- Turn the matched pairs into list of matching
    -- CustomerNames
    SELECT  [CustomerName1],
            [CustomerNameList]
    FROM (
        SELECT  [CustomerName1],
                CONVERT(VARCHAR(1000), '$'
                 + [CustomerName1] + '$'
                 + [CustomerName2]) AS [CustomerNameList]
        FROM [cteMatchedPairs]
        UNION ALL
        SELECT  [CustomerName2],
                CONVERT(VARCHAR(1000), '$'
                 + [CustomerName2]) AS [CustomerNameList]
        FROM [cteMatchedPairs]
    ) [cteMatchedPairs]
    UNION ALL
    SELECT  [cteMatchedList].[CustomerName],
            CONVERT(VARCHAR(1000),[CustomerNameList] + '$'
             + [cteMatchedPairs].[CustomerName2])
    FROM [cteMatchedList] -- recursive CTE
    INNER JOIN [cteMatchedPairs]
        ON RIGHT([cteMatchedList].[CustomerNameList],
         LEN([cteMatchedPairs].[CustomerName1])
        ) = [cteMatchedPairs].[CustomerName1]
),
[cteSubstringLists] AS (
    SELECT  r1.[CustomerName],
            r2.[CustomerNameList]
    FROM [cteMatchedList] r1
    INNER JOIN [cteMatchedList] r2
        ON r2.[CustomerNameList] LIKE '%' + r1.[CustomerNameList] + '%'
),
[cteCustomerLink] AS (
    SELECT DISTINCT 
            x1.[CustomerName],
            HASHBYTES('SHA1', x2.[CustomerNameList]) AS [CustomerLink]
    FROM (
        SELECT  [CustomerName],
                MAX(LEN([CustomerNameList])) AS [MAX LEN CustomerList]
        FROM [cteSubstringLists]
        GROUP BY [CustomerName]
    ) x1
    INNER JOIN (
        SELECT  [CustomerName],
                LEN([CustomerNameList]) AS [LEN CustomerList], 
                [CustomerNameList]
        FROM [cteSubstringLists]
    ) x2
        ON x1.[MAX LEN CustomerList] = x2.[LEN CustomerList]
        AND x1.[CustomerName] = x2.[CustomerName]
)
UPDATE  c
SET     [CustomerLink] = cl.[CustomerLink]
FROM [dbo].[Customer] c
INNER JOIN [cteCustomerLink] cl
    ON cl.[CustomerName] = c.[CustomerName]


SELECT *
FROM [dbo].[Customer]