SQL Server – Remove Punctuation, Special Chars, and Compare Columns

sql serversql-server-2008sql-server-2012

I have a table with new and old long description columns and I want to compare them by removing space, punctuation marks, and special characters for differences.

Create table DESCR_COMP
(
HCPCS Char(5) not null,
SEQNUM Char(4) not null,
NEW_LONG_DESC char(1000) Not NUll,
OLD_LONG_DESC char(1000) Not NUll,
SHORT_DESC char(250) Not nUll,
ADD_DT DATE,
ACT_EFF_DT DATE,
TERM_DT DATE,
CONSTRAINT PK_HCPCS primary key clustered
(
HCPCS ASC
)
)

Here are the values. total number of rows is 644 but below is the first few records

insert into Z64338_DESCR_COMP select 'A0120', '0010', 'Non-emergency transportation: mini-bus, mountain area transports, or other transportation systems', 'Non-emergency transportation: mini-bus, mountain area transports, or other  transportation systems', 'Noner transport mini-bus', '19820101', '20030101', ''
insert into Z64338_DESCR_COMP select 'A0130', '0010', 'Non-emergency transportation: wheelchair van', 'Non-emergency transportation: wheel-chair van', 'Noner transport wheelch van', '19820101', '', ''
insert into Z64338_DESCR_COMP select 'A0170', '0010', 'Transportation ancillary: parking fees, tolls, other', 'Transportation ancillary:  parking fees, tolls, other', 'Transport parking fees/tolls', '19820101', '20030101', ''
insert into Z64338_DESCR_COMP select 'A0180', '0010', 'Non-emergency transportation: ancillary: lodging-recipient', 'Non-emergency transportation: ancillary:  lodging-recipient', 'Noner transport lodgng recip', '19820101', '', ''
insert into Z64338_DESCR_COMP select 'A0190', '0010', 'Non-emergency transportation: ancillary: meals-recipient', 'Non-emergency transportation: ancillary:  meals-recipient', 'Noner transport meals recip', '19840101', '', ''
insert into Z64338_DESCR_COMP select 'A0200', '0010', 'Non-emergency transportation: ancillary: lodging escort', 'Non-emergency transportation: ancillary:  lodging escort', 'Noner transport lodgng escrt', '19820101', '', ''

I am first trying to see if I can query the columns with the values replaced and then use it in an update statement and have the column value stored in those columns and then compare. I start with the 'NEW_' column but the query is running for 20+ mins before I had to cancel it.

When I use Maxrecursion 1000, I get ",……has been exhausted before statement completion" so using infinite recursion and I don't know if that is the issue.

I dont know to use RegEX with SQLCLR so i havent gone that route. Are there more efficient ways doing it? Any way to fix my code?

DECLARE @Pattern varchar(50) = '%[^A-Za-z0-9]%';

WITH FixBadChars AS (SELECT NEW_LONG_DESC, cast(NEW_LONG_DESC as Varchar(max)) as FixedString, 1 AS MyCounter, HCPCS
                FROM [Z64338_DESCR_COMP]
                UNION ALL
                SELECT NEW_LONG_DESC, Replace(FixedString, PatIndex(@Pattern, FixedString COLLATE Latin1_General_BIN2), '') AS FixedString, 
                    MyCounter + 1, HCPCS
                FROM FixBadChars
                WHERE FixedString COLLATE Latin1_General_BIN2 LIKE @Pattern)
SELECT NEW_LONG_DESC, FixedString, MyCounter, HCPCS
FROM FixBadChars
WHERE MyCounter = 
        (SELECT MAX(MyCounter) 
        FROM FixBadChars Fixed
        WHERE Fixed.HCPCS = FixBadChars.HCPCS)
OPTION (MAXRECURSION 0);

UPDATE FixBadChars
SET NEW_LONG_DESC = FixedString
WHERE MyCounter = 
        (SELECT MAX(MyCounter) 
        FROM FixBadChars Fixed
        WHERE Fixed.Id = FixBadChars.Id)
OPTION (MAXRECURSION 0);

Best Answer

Since you only have 644 rows, you could probably get away (performance-wise) with creating a user defined function.

Create Function [dbo].[RemoveSpecialCharacters](@Temp VarChar(1000))
Returns VarChar(1000)
AS
Begin

    Declare @KeepValues as varchar(50)
    Set @KeepValues = '%[^A-Za-z0-9]%'
    While PatIndex(@KeepValues, @Temp) > 0
        Set @Temp = Stuff(@Temp, PatIndex(@KeepValues, @Temp), 1, '')

    Return @Temp
End

Then, assuming your sample data (I purposely changed a value in the last row.)

DROP TABLE IF EXISTS DESCR_COMP;
go
CREATE TABLE DESCR_COMP (
    HCPCS CHAR(5) NOT NULL
    ,SEQNUM CHAR(4) NOT NULL
    ,NEW_LONG_DESC CHAR(1000) NOT NULL
    ,OLD_LONG_DESC CHAR(1000) NOT NULL
    ,SHORT_DESC CHAR(250) NOT NULL
    ,ADD_DT DATE
    ,ACT_EFF_DT DATE
    ,TERM_DT DATE
    ,CONSTRAINT PK_HCPCS PRIMARY KEY CLUSTERED (HCPCS ASC)
    )
insert into DESCR_COMP select 'A0120', '0010', 'Non-emergency transportation: mini-bus, mountain area transports, or other transportation systems', 'Non-emergency transportation: mini-bus, mountain area transports, or other  transportation systems', 'Noner transport mini-bus', '19820101', '20030101', ''
insert into DESCR_COMP select 'A0130', '0010', 'Non-emergency transportation: wheelchair van', 'Non-emergency transportation: wheel-chair van', 'Noner transport wheelch van', '19820101', '', ''
insert into DESCR_COMP select 'A0170', '0010', 'Transportation ancillary: parking fees, tolls, other', 'Transportation ancillary:  parking fees, tolls, other', 'Transport parking fees/tolls', '19820101', '20030101', ''
insert into DESCR_COMP select 'A0180', '0010', 'Non-emergency transportation: ancillary: lodging-recipient', 'Non-emergency transportation: ancillary:  lodging-recipient', 'Noner transport lodgng recip', '19820101', '', ''
insert into DESCR_COMP select 'A0190', '0010', 'Non-emergency transportation: ancillary: meals-recipient', 'Non-emergency transportation: ancillary:  meals-recipient', 'Noner transport meals recip', '19840101', '', ''
insert into DESCR_COMP select 'A0200', '0010', 'Non-emergency transportation: ancillary: lodging escort', 'Non-emergency transportation: ancillar:  lodging escort', 'Noner transport lodgng escrt', '19820101', '', ''
go

SELECT *
FROM DESCR_COMP
WHERE dbo.RemoveSpecialCharacters(OLD_LONG_DESC) <> dbo.RemoveSpecialCharacters(NEW_LONG_DESC)

Results:

| HCPCS | SEQNUM | NEW_LONG_DESC                                           | OLD_LONG_DESC                                           | SHORT_DESC                   | ADD_DT     | ACT_EFF_DT | TERM_DT    |
|-------|--------|---------------------------------------------------------|---------------------------------------------------------|------------------------------|------------|------------|------------|
| A0200 | 0010   | Non-emergency transportation: ancillary: lodging escort | Non-emergency transportation: ancillar:  lodging escort | Noner transport lodgng escrt | 1982-01-01 | 1900-01-01 | 1900-01-01 |