MySQL regular expressions – regexp_count, regexp_extract

Directory

One, regexp_count

Two, regexp_extract

Three, regexp_extract_index

4. Test

1. Test data

2. Test results


When learning MySQL regular expressions, you may find that some common functions are not provided. The two most typical are: return the number of matches; return all matches at once. But we can create a custom function where we use a recursive query to do it.

1. regexp_count

drop function if exists regexp_count;
delimiter //
create function regexp_count(x text,r text, match_type varchar(5)) returns bigint
    reads sql data
    deterministic
begin
    set @ret=0;

    with recursive cte as
    (select n, regexp_substr(x,r,1,t.n,match_type) b from (select 1 n) t
      union all
     select n + 1, regexp_substr(x,r,1, n + 1,match_type) from cte
      where b is not null)
    select count(*) into @ret from cte where b is not null;

    return @ret;
end;
//
delimiter;

2. regexp_extract

drop function if exists regexp_extract;
delimiter //
create function regexp_extract(x text,r text, match_type varchar(5)) returns text charset utf8mb4
    reads sql data
    deterministic
begin
    set @ret='';

    with recursive cte as
    (select n, regexp_substr(x,r,1,t.n,match_type) b from (select 1 n) t
      union all
     select n + 1, regexp_substr(x,r,1, n + 1,match_type) from cte
      where b is not null)
    select * into @ret from
    (select convert(group_concat(b) using utf8mb4) a from cte) t
     where a is not null;

    return @ret;
end;
//
delimiter;

3. regexp_extract_index

drop function if exists regexp_extract_index;
delimiter //
create function regexp_extract_index(x text,r text, return_option int, match_type varchar(5)) returns text charset utf8mb4
    reads sql data
    deterministic
begin
    set @ret='';
 
    with recursive cte as
    (select n, regexp_instr(x,r,1,t.n,return_option,match_type) b from (select 1 n) t
      union all
     select n + 1, regexp_instr(x,r,1, n + 1, return_option, match_type) from cte
      where b > 0)
    select * into @ret from
    (select convert(group_concat(b) using utf8mb4) a from cte where b >0) t;
 
    return @ret;
end;
//
delimiter;

4. Test

1. Test data

drop table if exists t_regexp;
create table t_regexp(a text);
insert into t_regexp values (
'THE RIME OF THE ANCYENT MARINERE, IN SEVEN PARTS.
ARGUMENT.
How a Ship having passed the Line was driven by Storms to the cold Country
towards the South Pole; and how from thence she made her course to the tropical
Latitude of the Great Pacific Ocean; and of the strange things that befell;
And in what manner the Ancyent Marine came back to his own Country.
I.
1 It is an ancyent Marinere,
2 And he stoppeth one of three:
3 "By thy long gray beard and thy glittering eye
4 "Now wherefore stoppest me?');
 
insert into t_regexp values ('THE RIME OF THE ANCYENT MARINERE, IN SEVEN PARTS.'),
('ARGUMENT.'),
('How a Ship having passed the Line was driven by Storms to the cold Country'),
('towards the South Pole; and how from thence she made her course to the tropical'),
('Latitude of the Great Pacific Ocean; and of the strange things that befell;'),
('and in what manner the Ancyent Marinere came back to his own Country.'),
('I.'),
('1 It is an ancyent Marinere,'),
('2 And he stoppeth one of three:'),
('3 "By thy long gray beard and thy glittering eye'),
('4 "Now wherefore stoppest me?');

2. Test result

mysql> -- Match the word the, count the number of occurrences
mysql> select regexp_extract(a,'\bthe\b','') a,regexp_count(a,'\bthe\b',' ') c from t_regexp;
 + ------------------------------------- + ------ +
| a | c |
 + ------------------------------------- + ------ +
| THE, THE, the, the, the, the, the, the, the | 9 |
| THE,THE | 2 |
| | 0 |
| the, the | 2 |
| the, the | 2 |
| the, the | 2 |
| the | 1 |
| | 0 |
| | 0 |
| | 0 |
| | 0 |
| | 0 |
 + ------------------------------------- + ------ +
12 rows in set (0.00 sec)

mysql> -- Match the lowercase word the, and count the number of occurrences
mysql> select regexp_extract(a,'\bthe\b','c') a,regexp_count(a,'\bthe\b',\ 'c') c from t_regexp;
 + ----------------------------- + ------ +
| a | c |
 + ----------------------------- + ------ +
| the, the, the, the, the, the, the | 7 |
| | 0 |
| | 0 |
| the, the | 2 |
| the, the | 2 |
| the, the | 2 |
| the | 1 |
| | 0 |
| | 0 |
| | 0 |
| | 0 |
| | 0 |
 + ----------------------------- + ------ +
12 rows in set (0.00 sec)

mysql> -- All words matched by multi-line pattern, count the number of words
mysql> select regexp_extract(a,'\w + ','') a, regexp_count(a,'\w + ','m') c from t_regexp;
 + ------------------------------------------------- -------------------------------------------------- -------------------------------------------------- -------------------------------------------------- -------------------------------------------------- -------------------------------------------------- -------------------------------------------------- -------------------------------------------------- -------------------------------------------------- -------------------------------------------+------ +
| a | c |
 + ------------------------------------------------- -------------------------------------------------- -------------------------------------------------- -------------------------------------------------- -------------------------------------------------- -------------------------------------------------- -------------------------------------------------- -------------------------------------------------- -------------------------------------------------- -------------------------------------------+------ +
| THE,RIME,OF,THE,ANCYENT,MARINERE,IN,SEVEN,PARTS,ARGUMENT,How,a,Ship,having,passed,the,Line,was,driven,by,Storms,to,the,cold,Country ,towards,the,South,Pole,and,how,from,thence,she,made,her,course,to,the,tropical,Latitude,of,the,Great,Pacific,Ocean,and,of,the,strange ,things, that, befell, and, in, what, manner, the, Ancyent, Marinere, came, back, to, his, own, Country, I, 1, It, is, an, ancyent, Marinere, 2, And ,he,stoppeth,one,of,three,3,By,thy,long,grey,beard,and,thy,glittering,eye,4,Now,wherefore,stoppest,me | 95 |
| THE,RIME,OF,THE,ANCYENT,MARINERE,IN,SEVEN,PARTS | 9 |
| ARGUMENT | 1 |
| How, a, Ship, having, passed, the, Line, was, driven, by, Storms, to, the, cold, Country | 15 |
| towards, the, South, Pole, and, how, from, thence, she, made, her, course, to, the, tropical | 15 |
| Latitude, of, the, Great, Pacific, Ocean, and, of, the, strange, things, that, befell | 13 |
| and, in, what, manner, the, Ancyent, Marinere, came, back, to, his, own, Country | 13 |
| I | 1 |
| 1,It,is,an,ancyent,Marinere | 6 |
| 2, And, he, stoppeth, one, of, three | 7 |
| 3,By,thy,long,grey,beard,and,thy,glittering,eye | 10 |
| 4, Now, wherefore, stoppest, me | 5 |
 + ------------------------------------------------- -------------------------------------------------- -------------------------------------------------- -------------------------------------------------- -------------------------------------------------- -------------------------------------------------- -------------------------------------------------- -------------------------------------------------- -------------------------------------------------- -------------------------------------------+------ +
12 rows in set (0.01 sec)

mysql> -- Case-sensitive, multi-line, dotall pattern matching 999, count the number of occurrences
mysql> select regexp_extract(a,'999','') a,regexp_count(a,'999','cmn') c from t_regexp;
 + ------ + ------+
| a | c |
 + ------ + ------+
| | 0 |
| | 0 |
| | 0 |
| | 0 |
| | 0 |
| | 0 |
| | 0 |
| | 0 |
| | 0 |
| | 0 |
| | 0 |
| | 0 |
 + ------ + ------+
12 rows in set (0.00 sec)

mysql> -- returns the occurrence of the word the
mysql> select regexp_extract_index(a,'\bthe\b',0,'') a from t_regexp;
 + ------------------------------------+
| a |
 + ------------------------------------+
| 1,13,86,119,144,203,228,260,311 |
| 1,13 |
| NULL |
| 26,59 |
| 9,68 |
| 13,45 |
| 20 |
| NULL |
| NULL |
| NULL |
| NULL |
| NULL |
 + ------------------------------------+
12 rows in set (0.00 sec)

mysql> -- returns the position where the lowercase word the appears (after)
mysql> select regexp_extract_index(a,'\bthe\b',1,'c') a from t_regexp;
 + ----------------------------+
| a |
 + ----------------------------+
| 89,122,147,206,231,263,314 |
| NULL |
| NULL |
| 29,62 |
| 12,71 |
| 16,48 |
| 23 |
| NULL |
| NULL |
| NULL |
| NULL |
| NULL |
 + ----------------------------+
12 rows in set (0.00 sec)

mysql> -- Multi-line pattern matching, returns the position of all words
mysql> select regexp_extract_index(a,'\w + ',0,'m') a from t_regexp;
 + ------------------------------------------------- -------------------------------------------------- -------------------------------------------------- -------------------------------------------------- -------------------------------------------------- -------------------------------------------------- -------------------------------------------------- ----------- +
| a |
 + ------------------------------------------------- -------------------------------------------------- -------------------------------------------------- -------------------------------------------------- -------------------------------------------------- -------------------------------------------------- -------------------------------------------------- ----------- +
1,5,10,13,17,25,35,38,44,51,61,65,67,72,79,86,90,95,99,106,109,116,119,123,128,136,144,148,154,160,164,168,173,180,184,1 89,193,200,203,207,216,225,228,232,238,246,253,257,260,264,272,279,284,292,296,299,304,311,315,323,332,337,342,345,349,353, 362,365,373,376,379,382,390,400,411,415,418,427,431,434,441,450,453,457,462,467,473,477,481,492,496,508,512,522,531 |
|1,5,10,13,17,25,35,38,44|
| 1 |
|1,5,7,12,19,26,30,35,39,46,49,56,59,63,68|
|1,9,13,19,25,29,33,38,45,49,54,58,65,68,72|
|1,10,13,17,23,31,38,42,45,49,57,64,69|
|1,5,8,13,20,24,32,41,46,51,54,58,62|
| 1 |
|1,9,12,15,18,26|
|1,12,16,19,28,32,35|
|1,10,13,17,22,27,33,37,41,52|
|1,13,17,27,36|
 + ------------------------------------------------- -------------------------------------------------- -------------------------------------------------- -------------------------------------------------- -------------------------------------------------- -------------------------------------------------- -------------------------------------------------- ----------- +
12 rows in set (0.00 sec)

mysql> -- Case-sensitive, multi-line, dotall pattern matching, returns the position where 999 appears
mysql> select regexp_extract_index(a,'999',1,'cmn') a from t_regexp;
 + ------+
| a |
 + ------+
| NULL |
| NULL |
| NULL |
| NULL |
| NULL |
| NULL |
| NULL |
| NULL |
| NULL |
| NULL |
| NULL |
| NULL |
 + ------+
12 rows in set (0.00 sec)