
Thursday, September 30, 2010

BinBase - generate histograms

SQL statements to generate histograms, to learn more about your data.

similarity


select count("range"),"range" from
(
select
case
when match > 500 and match <= 550 then '501 - 550' when match > 550 and match <= 600 then '550 - 600' when match > 600 and match <= 650 then '600 - 650' when match > 650 and match <= 700 then '650 - 700' when match > 700 and match <= 750 then '700 - 750' when match > 750 and match <= 800 then '750 - 800' when match > 800 and match <= 850 then '800 - 850' when match > 850 and match <= 900 then '850 - 900' when match > 900 and match <= 950 then '900 - 950' when match > 950 and match <= 1000 then '950 - 1000' end AS "range" from spectra a, bin b where a.bin_id = b.bin_id and b.export = 'FALSE' ) a group by "range"


purity


select count("range"),"range" from
(
select
case
when a.purity > 0 and a.purity <= 0.1 then '0 - 0.1' when a.purity > 0.1 and a.purity <= 0.5 then '0.1 - 0.5' when a.purity > 0.5 and a.purity <= 0.9 then '0.5 - 0.9' when a.purity > 0.9 and a.purity <= 1.3 then '0.9 - 1.3' when a.purity > 1.3 and a.purity <= 1.7 then '1.3 - 1.7' when a.purity > 1.7 and a.purity <= 2.1 then '1.7 - 2.1' when a.purity > 2.1 and a.purity <= 2.5 then '2.1 - 2.5' when a.purity > 2.5 then '> 2.5'
end
AS "range" from spectra a, bin b where a.bin_id = b.bin_id and b.export = 'TRUE'
)
a group by "range"


signal noise
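
The signal/noise histogram was not written out in the original post; the statement below is just a sketch following the same pattern, with bucket boundaries borrowed from the sifter thresholds (5, 50, 500) used in the sifter simulation post further down.

-- sketch only: signal/noise histogram, the bucket boundaries are an assumption
select count("range"),"range" from
(
select
case
  when a.signal_noise > 0 and a.signal_noise <= 5 then '0 - 5'
  when a.signal_noise > 5 and a.signal_noise <= 50 then '5 - 50'
  when a.signal_noise > 50 and a.signal_noise <= 500 then '50 - 500'
  when a.signal_noise > 500 then '> 500'
end AS "range"
from spectra a, bin b
where a.bin_id = b.bin_id and b.export = 'TRUE'
) a group by "range"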


Tuesday, September 28, 2010

BinBase - reset database

A simple SQL script to reset the BinBase database.


delete from bin where bin_id not in (select bin_id from standard)
GO
delete from spectra where bin_id not in (select bin_id from standard)
GO
delete from correction_data
GO
delete from samples where sample_id not in (select sample_id from bin)
GO
delete from rawdata
GO
delete from result_link
GO
delete from result
GO
delete from runtime
GO
delete from quantification
GO
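
If the target database is Postgres (which the other posts suggest), the same reset can also be run as a single transaction, so a failure in the middle leaves nothing half-deleted. This is only a sketch of the script above, not a separate tool:

-- sketch: the identical deletes as above, wrapped in one transaction (assumes PostgreSQL)
begin;
delete from bin where bin_id not in (select bin_id from standard);
delete from spectra where bin_id not in (select bin_id from standard);
delete from correction_data;
delete from samples where sample_id not in (select sample_id from bin);
delete from rawdata;
delete from result_link;
delete from result;
delete from runtime;
delete from quantification;
commit;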

Monday, September 27, 2010

BinBase - simulate the sifter algorithm directly on the database

Currently I'm generating statistics for the BinBase system, so I needed to simulate the annotation algorithm in SQL.

It was rather simple....



-- 0.05 settings
select
count(*) as "count",
lower('0 < 0.05') as purity, lower('> 500') as signalNoise,
lower('> 800') as similarity

from spectra where purity > 0 and purity < 0.05 and signal_noise > 500 and match > 800

UNION

select
count(*) as "count",
lower('0 < 0.05') as purity, lower('> 50 && < 500') as signalNoise, lower('> 700') as similarity

from spectra where purity > 0 and purity < 0.05 and signal_noise > 50 and signal_noise <= 500 and match > 700


UNION

select
count(*) as "count",
lower('0 < 0.05') as purity, lower('> 5 && < 50') as signalNoise, lower('> 600') as similarity

from spectra where purity > 0 and purity < 0.05 and signal_noise > 5 and signal_noise <= 50 and match > 600

-- 0.3 settings

UNION

select
count(*) as "count",
lower('0 < 0.3') as purity, lower('> 500') as signalNoise,
lower('> 700') as similarity

from spectra where purity > 0 and purity < 0.3 and signal_noise > 500 and match > 700

UNION

select
count(*) as "count",
lower('0 < 0.3') as purity, lower('> 50 && < 500') as signalNoise, lower('> 600') as similarity

from spectra where purity > 0 and purity < 0.3 and signal_noise > 50 and signal_noise <= 500 and match > 600

UNION

select
count(*) as "count",
lower('0 < 0.3') as purity, lower('> 5 && < 50') as signalNoise, lower('> 500') as similarity

from spectra where purity > 0 and purity < 0.3 and signal_noise > 5 and signal_noise <= 50 and match > 500

-- all other settings

UNION

select
count(*) as "count",
lower('> 0.3') as purity,
lower('> 500') as signalNoise,
lower('> 600') as similarity

from spectra where purity > 0.3 and signal_noise > 500 and match > 600

UNION

select
count(*) as "count",
lower('> 0.3') as purity,
lower('> 50 && < 500') as signalNoise, lower('> 500') as similarity

from spectra where purity > 0.3 and signal_noise > 50 and signal_noise <= 500 and match > 500

UNION

select
count(*) as "count",
lower('> 0.3') as purity,
lower('> 5 && < 50') as signalNoise, lower('> 450') as similarity

from spectra where purity > 0.3 and signal_noise > 5 and signal_noise <= 50 and match > 450

-- total --

union

select
count(*) as "count",
lower('no filter') as purity,
lower('no filter') as signalNoise,
lower('no filter') as similarity

from spectra



LECO 2.22 settings


-- 0.1 settings
select
count(*) as "count",
lower('0 < 0.1') as purity, lower('> 250') as signalNoise,
lower('> 800') as similarity

from spectra where purity > 0 and purity < 0.1 and signal_noise > 250 and match > 800

UNION

select
count(*) as "count",
lower('0 < 0.1') as purity, lower('> 25 && < 250') as signalNoise, lower('> 700') as similarity

from spectra where purity > 0 and purity < 0.1 and signal_noise > 25 and signal_noise <= 250 and match > 700


UNION

select
count(*) as "count",
lower('0 < 0.1') as purity, lower('> 3 && < 25') as signalNoise, lower('> 600') as similarity

from spectra where purity > 0 and purity < 0.1 and signal_noise > 3 and signal_noise <= 25 and match > 600

-- 1.5 settings

UNION

select
count(*) as "count",
lower('0 < 1.5') as purity, lower('> 250') as signalNoise,
lower('> 700') as similarity

from spectra where purity > 0 and purity < 1.5 and signal_noise > 250 and match > 700

UNION

select
count(*) as "count",
lower('0 < 1.5') as purity, lower('> 25 && < 250') as signalNoise, lower('> 600') as similarity

from spectra where purity > 0 and purity < 1.5 and signal_noise > 25 and signal_noise <= 250 and match > 600

UNION

select
count(*) as "count",
lower('0 < 1.5') as purity, lower('> 3 && < 25') as signalNoise, lower('> 500') as similarity

from spectra where purity > 0 and purity < 1.5 and signal_noise > 3 and signal_noise <= 25 and match > 500

-- all other settings

UNION

select
count(*) as "count",
lower('> 1.5') as purity,
lower('> 250') as signalNoise,
lower('> 600') as similarity

from spectra where purity > 1.5 and signal_noise > 250 and match > 600

UNION

select
count(*) as "count",
lower('> 1.5') as purity,
lower('> 3 && < 250') as signalNoise, lower('> 500') as similarity

from spectra where purity > 1.5 and signal_noise > 3 and signal_noise <= 250 and match > 500

UNION

-- total --

select
count(*) as "count",
lower('no filter') as purity,
lower('no filter') as signalNoise,
lower('no filter') as similarity

from spectra


and here the same query joined against the bin table (export = 'TRUE') to filter polysiloxanes out of the result for LECO 2.22




-- 0.1 settings
select
count(*) as "count",
lower('0 < 0.1') as purity, lower('> 250') as signalNoise,
lower('> 800') as similarity

from spectra a, bin b where a.purity > 0 and a.purity < 0.1 and a.signal_noise > 250 and match > 800 and a.bin_id = b.bin_id and b.export = 'TRUE'

UNION

select
count(*) as "count",
lower('0 < 0.1') as purity, lower('> 25 && < 250') as signalNoise, lower('> 700') as similarity

from spectra a, bin b where a.purity > 0 and a.purity < 0.1 and a.signal_noise > 25 and a.signal_noise <= 250 and match > 700 and a.bin_id = b.bin_id and b.export = 'TRUE'


UNION

select
count(*) as "count",
lower('0 < 0.1') as purity, lower('> 3 && < 25') as signalNoise, lower('> 600') as similarity

from spectra a, bin b where a.purity > 0 and a.purity < 0.1 and a.signal_noise > 3 and a.signal_noise <= 25 and match > 600 and a.bin_id = b.bin_id and b.export = 'TRUE'

-- 1.5 settings

UNION

select
count(*) as "count",
lower('0 < 1.5') as purity, lower('> 250') as signalNoise,
lower('> 700') as similarity

from spectra a, bin b where a.purity > 0 and a.purity < 1.5 and a.signal_noise > 250 and match > 700 and a.bin_id = b.bin_id and b.export = 'TRUE'

UNION

select
count(*) as "count",
lower('0 < 1.5') as purity, lower('> 25 && < 250') as signalNoise, lower('> 600') as similarity

from spectra a, bin b where a.purity > 0 and a.purity < 1.5 and a.signal_noise > 25 and a.signal_noise <= 250 and match > 600 and a.bin_id = b.bin_id and b.export = 'TRUE'

UNION

select
count(*) as "count",
lower('0 < 1.5') as purity, lower('> 3 && < 25') as signalNoise, lower('> 500') as similarity

from spectra a, bin b where a.purity > 0 and a.purity < 1.5 and a.signal_noise > 3 and a.signal_noise <= 25 and match > 500 and a.bin_id = b.bin_id and b.export = 'TRUE'

-- all other settings

UNION

select
count(*) as "count",
lower('> 1.5') as purity,
lower('> 250') as signalNoise,
lower('> 600') as similarity

from spectra a, bin b where a.purity > 1.5 and a.signal_noise > 250 and match > 600 and a.bin_id = b.bin_id and b.export = 'TRUE'

UNION

select
count(*) as "count",
lower('> 1.5') as purity,
lower('> 3 && < 250') as signalNoise, lower('> 500') as similarity

from spectra a, bin b where a.purity > 1.5 and a.signal_noise > 3 and a.signal_noise <= 250 and match > 500 and a.bin_id = b.bin_id and b.export = 'TRUE'

UNION

-- total --

select
count(*) as "count",
lower('no filter') as purity,
lower('no filter') as signalNoise,
lower('no filter') as similarity

from spectra a, bin b where a.bin_id = b.bin_id and b.export = 'TRUE'

Wednesday, September 22, 2010

BinBase queries

A short overview of BinBase queries to generate statistics.



count of spectra

select count(*) from spectra a, samples b where a.sample_id = b.sample_id and visible = 'TRUE'

count of spectra with no annotations

select count(*) from spectra a, samples b where a.sample_id = b.sample_id and visible = 'TRUE' and bin_id is null

count of samples

select count(*) from samples where visible = 'TRUE'

known species

select distinct value from sample_info a, meta_key b where a.key_id = b.key_id and b.key = 'Species'

count of species

select count(distinct value) from sample_info a, meta_key b where a.key_id = b.key_id and b.key = 'Species'

sample count for species

select count(a.sample_id),a.value from sample_info a, meta_key b,samples c where a.key_id = b.key_id and b.key = 'Species' and a.sample_id = c.sample_id and c.visible = 'TRUE' group by a.value

bin generation by species

select a.value,count(d.spectra_id) from sample_info a, meta_key b,samples c,bin d where a.key_id = b.key_id and b.key = 'Species' and a.sample_id = c.sample_id and d.sample_id = c.sample_id group by a.value

ri correction successful

select count(*) from samples where visible = 'TRUE' and correction_failed = 'FALSE'

bins that are not yet named (the name is still just the bin id)

select count(bin_id) from bin where name = TO_CHAR(bin_id,'FM9999999999')
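
And the complementary count, bins that were given a real name, is just the negated comparison (a sketch based on the query above):

select count(bin_id) from bin where name <> TO_CHAR(bin_id,'FM9999999999')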

distribution of similarities


select count("range"),"range" from (
select
case
when match >= 0 and match <= 100 then '0 - 100' when match > 100 and match <= 200 then '101 - 200' when match > 200 and match <= 300 then '201 - 300' when match > 300 and match <= 400 then '301 - 400' when match > 400 and match <= 500 then '401 - 500' when match > 500 and match <= 600 then '501 - 600' when match > 600 and match <= 700 then '601 - 700' when match > 700 and match <= 800 then '701 - 800' when match > 800 and match <= 900 then '801 - 900' when match > 900 and match <= 1000 then '901 - 1000' end AS "range" from spectra ) a group by "range"

Tuesday, May 4, 2010

flushing BinBase

Once in a while you think you want to reprocess all the data in a BinBase database, and for this reason you want to flush all the content except the retention index standards. This should never be required, but if you decide to regenerate all the bins with different settings, well, you gotta do what you gotta do.

Here is a quick SQL script to do this for you.



delete from comments
go
delete from sample_info
go
delete from meta_key
go
delete from metainformation
go
delete from runtime
go
delete from bin_compare
go
delete from virtual_bin
go
delete from bin a where a.bin_id not in (select bin_id from standard)
go
delete from samples a where a.sample_id not in (select sample_id from bin)
go
delete from reference_class
go
delete from reference
go
delete from result_link
go
delete from bin_ratio
go
delete from bin_references
go
delete from classification
go
delete from configuration where configuration_id not in (select configuration_id from samples)
go
delete from spectra where sample_id not in (select sample_id from samples)
go
delete from qualitycontrol
go
delete from result
go
delete from standard_hist
go
delete from structure
go
delete from substance_classes
go
delete from synonyme
go
delete from user_validation
go


Friday, March 5, 2010

Postgres - partial indexes and sort indexes

This morning was a sad moment. Normally people ask me all the time about SQL optimization, SQL tuning, when to create an index, and so on.

This morning I had another DOOOH moment.

While trying to optimize BinBase even further, I noticed that some queries took an absurd amount of time to finish, like 500 seconds just to query the bin table.

So I started to investigate and discovered that some of my queries were using seq scans over huge tables, for no apparent reason.

It turned out that the sort column was not indexed...


CREATE INDEX bin_retention_index ON bin (retention_index ASC NULLS LAST);


Executing this statement improved things greatly.

But there was still a huge wait time for one particular query:


explain select * from BIN where bin_id not in ( SELECT bin_id from SPECTRA where sample_id = 1630733 and bin_id is not null ) ORDER BY retention_index ASC


and it turned out that the index was never used for the 'is not null' part. After some research and a lot of head scratching I found that Postgres supports partial indexes for exactly this case.


create index spectra_bin_id_partial_is_not_null on spectra (bin_id) where bin_id is not null


Afterwards we saw some improvement, but the actual slow data access is caused by something else: we always need the complete table minus a few compounds, and that operation takes some time.

Time to optimize the internal database lookup...
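
One candidate I'd look at next (just a sketch, not something benchmarked in this post) is writing the lookup as NOT EXISTS instead of NOT IN, which Postgres can plan as an anti-join:

-- sketch only: the same lookup expressed as an anti-join via NOT EXISTS
select b.*
from bin b
where not exists (
  select 1 from spectra s
  where s.sample_id = 1630733
    and s.bin_id = b.bin_id
)
order by b.retention_index asc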

Wednesday, November 18, 2009

SQL - small mistakes, big impact

the last couple of days I was fighting with a database which became slower and slower and slower and now needed close to 24h to execute a rather simple query...

After working for hours on different parameters and tuning the database, including setting up a spare server to see if a dump and restore would fix the issue, I got the time down to 5 minutes.

Sounds good?

Well, not really, since I needed execution times in the range of a couple of seconds. So I actually looked at the SQL statement that caused the issues and found a gigantic error in it...

the statement:


select * from BIN where bin_id not in ( SELECT bin_id from SPECTRA where sample_id = 424105 AND SPECTRA.bin_id = BIN.bin_id)


and the correct statement would be:


select * from BIN a where a.bin_id not in ( SELECT bin_id from SPECTRA where sample_id = 424105 and bin_id is not null) ORDER BY retention_index ASC


the problem?


SPECTRA.bin_id = BIN.bin_id


This little part is totally wrong in the whole statement and causes a seq scan over a table with 70 million entries against a table with 6000 entries. It does nothing for the query, and I guess it was left over from a copy/paste accident 6 years ago when I wrote this script originally.

let's compare some numbers:

explain before:

Seq Scan on bin  (cost=0.00..459513.42 rows=3044 width=1290)
  Filter: (NOT (subplan))
  SubPlan
    ->  Bitmap Heap Scan on spectra  (cost=75.20..75.42 rows=2 width=8)
          Recheck Cond: ((sample_id = 424105::double precision) AND (bin_id = ($0)::double precision))
          ->  BitmapAnd  (cost=75.20..75.20 rows=2 width=0)
                ->  Bitmap Index Scan on spectra_sample_id  (cost=0.00..18.99 rows=5696 width=0)
                      Index Cond: (sample_id = 424105::double precision)
                ->  Bitmap Index Scan on spectra_binid  (cost=0.00..56.18 rows=17658 width=0)
                      Index Cond: (bin_id = ($0)::double precision)

explain after:

Sort  (cost=1765.47..1766.23 rows=3044 width=1290)
  Sort Key: retention_index
  ->  Seq Scan on bin a  (cost=647.21..1747.86 rows=3044 width=1290)
        Filter: (NOT (hashed subplan))
        SubPlan
          ->  Index Scan using spectra_sample_id on spectra  (cost=0.00..646.42 rows=3138 width=8)
                Index Cond: (sample_id = 424105::double precision)
                Filter: (bin_id IS NOT NULL)


In real-world numbers this translates to 24 hours with the old statement compared to 4 seconds with the new one...

Tuesday, July 28, 2009

distinct or group by, what's more expensive

Today I was asking myself about the best approach for a performance-critical application.

Distinct or group by '*'



select name from table group by name



or



select distinct name from table



After all, they both return the exact same result.

A few explain statements later it turned out that the group by version is about 18% faster than the distinct version.
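
For anyone who wants to repeat the comparison, this is roughly the check to run; the table and column names below are just stand-ins, not the actual schema from this test:

-- sketch: compare estimated cost and actual runtime of both variants
explain analyze select sample_name from samples group by sample_name;
explain analyze select distinct sample_name from samples;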

Monday, July 13, 2009

What's more expensive IN or BETWEEN statement

This morning I spent a couple of hours tuning and testing different SQL statements to find the fastest way to fetch some data for a real-time application.

While doing this I discovered that the IN query costs slightly more than the BETWEEN query.
It's not a lot, but still something.

IN query:


explain select sample_name,class,date_of_import,date from samples where visible = 'TRUE' and sample_name like '%q%' and date in( '5/26/2009 12:00:00 AM', '6/11/2009 12:00:00 AM','5/26/2009 12:00:00 AM','2009-06-01 12:00:00 AM') order by 1 desc


result:

QUERY PLAN
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Sort (cost=8126.07..8126.42 rows=139 width=37)
Sort Key: sample_name
-> Seq Scan on samples (cost=0.00..8121.12 rows=139 width=37)
Filter: (((sample_name)::text ~~ '%q%'::text) AND ((visible)::text = 'TRUE'::text) AND (date = ANY ('{"2009-05-26 00:00:00","2009-06-11 00:00:00","2009-05-26 00:00:00","2009-06-01 00:00:00"}'::timestamp without time zone[])))

4 record(s) selected [Fetch MetaData: 0/ms] [Fetch Data: 0/ms]

[Executed: 7/13/09 1:11:46 PM PDT ] [Execution: 2/ms]





BETWEEN query:


explain select sample_name,class,date_of_import,date from samples where visible = 'TRUE' and sample_name like '%q%' and date between '5/26/2009 12:00:00 AM' and '6/11/2009 12:00:00 AM' order by 1 desc


Result:


QUERY PLAN
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Sort (cost=8123.15..8123.32 rows=67 width=37)
Sort Key: sample_name
-> Seq Scan on samples (cost=0.00..8121.12 rows=67 width=37)
Filter: (((sample_name)::text ~~ '%q%'::text) AND (date >= '2009-05-26 00:00:00'::timestamp without time zone) AND (date <= '2009-06-11 00:00:00'::timestamp without time zone) AND ((visible)::text = 'TRUE'::text))

4 record(s) selected [Fetch MetaData: 0/ms] [Fetch Data: 0/ms]

[Executed: 7/13/09 12:53:32 PM PDT ] [Execution: 2/ms]

Thursday, July 9, 2009

Hacking SetupX - working on the sql tables instead of hibernate

Currently one of my jobs is to get data out of the SetupX system, since the developer left us and nobody has a clue how it actually works.

So far the basic understanding is that the system has one major table and all queries are executed by building subqueries against it.

Advantage of the technique: it's extremely flexible.

Disadvantage? It's ridiculously slow and a nightmare to figure out, since you get so many possible sub graphs.

Don't we love reverse engineering existing software?

So this is just a small collection of statements to get data out.

get list of available questions and attributes


select question from formobject group by question;


get the list of registered species


select value from formobject where question = 'species' group by value;



The final idea is to take a snapshot of the database every night and transform it into a real database structure, with several proper tables and so on.
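
A rough sketch of what that transformation could look like, pivoting the question/value pairs into a flat table. Only formobject, question, value and the 'species' question appear in the queries above; the experiment_id grouping column and the 'organ' question are made up for illustration:

-- sketch only: pivot the generic question/value rows into a flat table
-- experiment_id and the 'organ' question are assumptions for illustration
create table experiment_flat as
select
  experiment_id,
  max(case when question = 'species' then value end) as species,
  max(case when question = 'organ' then value end) as organ
from formobject
group by experiment_id;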