Eratosthenes素数比同时更快?
我目前正在编写一个程序,首先依次由Eratosthenes Sieve生成素数,然后同时进行。 该algorithm的并发版本应该比顺序版本更快,但在我的情况下,并发版本是约。 慢了10倍 我想知道我在哪里把额外的工作放在我的线程上,而不是顺序解决scheme中的主线程。 这是我的程序(准备阅读一下!):
Primes.java :
public abstract class Primes { byte[] bitArr; int maxNum; final int[] BITMASK = { 1, 2, 4, 8, 16, 32, 64 }; final int[] BITMASK2 = { 255 - 1, 255 - 2, 255 - 4, 255 - 8, 255 - 16, 255 - 32, 255 - 64 }; void setAllPrime() { for (int i = 0; i < bitArr.length; i++) { bitArr[i] = (byte) 127; } } void crossOut(int i) { bitArr[i/14] = (byte) (bitArr[i/14] - BITMASK[((i/2)%7)]); } boolean isPrime(int i) { if(i == 2){ return true; } if((i%2) == 0){ return false; } return (bitArr[i/14] & BITMASK[(i%14)>>1]) != 0; } int nextPrime(int i) { int k; if ((i%2) == 0){ k =i+1; } else { k = i+2; } while (!isPrime(k) && k < maxNum){ k+=2; } return k; } void printAllPrimes() { for (int i = 2; i <= maxNum; i++){ if (isPrime(i)){ System.out.println("Prime: " + i); } } } }
PrimesSeq.java :
import java.util.ArrayList; public class PrimesSeq extends Primes{ PrimesSeq(int maxNum) { this.maxNum = maxNum; bitArr = new byte[(maxNum / 14) + 1]; setAllPrime(); generatePrimesByEratosthenes(); } void generatePrimesByEratosthenes() { crossOut(1); // 1 is not a prime int curr = 3; while(curr < Math.sqrt(maxNum)){ for(int i = curr*curr; i < maxNum; i+=2*curr){ if(isPrime(i)){ // 2*curr because odd*2 = even! crossOut(i); } } curr = nextPrime(curr); } } }
PrimesPara.java :
import java.util.ArrayList; public class PrimesPara extends Primes { PrimeThread[] threads; int processors; int currentState = 0; //0 = Init //1 = Generate primes after thread #0 finish //2 = Factorize public PrimesPara(int maxNum){ this.maxNum = maxNum; this.processors = Runtime.getRuntime().availableProcessors(); bitArr = new byte[(maxNum / 14) + 1]; setAllPrime(); this.threads = new PrimeThread[processors*2]; generateErastothenesConcurrently(); //printAllPrimes(); } public void generateErastothenesConcurrently(){ int[] starts = generateThreadIndexes(); for(int i = 0; i < threads.length; i++){ if(i != threads.length-1){ threads[i] = new PrimeThread(starts[i], starts[i+1]-1, i); } else { threads[i] = new PrimeThread(starts[i], maxNum, i); } } //Start generating the first primes crossOut(1); Thread th = new Thread(threads[0]); th.start(); try { th.join(); } catch (InterruptedException e) { // TODO Auto-generated catch block e.printStackTrace(); } currentState = 1; //Start generating the rest of the primes Thread[] thrs = new Thread[threads.length]; for(int i = 0; i < thrs.length; i++){ thrs[i] = new Thread(threads[i]); thrs[i].start(); } for(int i = 0; i < thrs.length; i++){ try { thrs[i].join(); } catch (InterruptedException e) { e.printStackTrace(); } } currentState = 2; } private int[] generateThreadIndexes(){ int[] indexes = new int[processors*2]; for(int i = 0; i < indexes.length; i++){ indexes[i] = (i*((maxNum/(processors*2)))); } indexes[indexes.length-1]++; return indexes; } public class PrimeThread implements Runnable { int start; int end; int thridx; public PrimeThread(int start, int end, int thridx){ this.start = start; this.end = end; this.thridx = thridx; } public void run() { switch(currentState){ case 0: generateSqrtPrimes(); break; case 1: generateMyPrimes(); break; case 2: break; } } private void generateSqrtPrimes(){ int curr = 3; while(curr < Math.sqrt(maxNum)+1){ for(int i = curr*curr; i < Math.sqrt(maxNum)+1; i+=2*curr){ if(isPrime(i)){ // 2*curr because odd*2 = even! crossOut(i); } } curr = nextPrime(curr); } } private void generateMyPrimes(){ int curr = start>(int)Math.sqrt(maxNum)?start:(int)Math.sqrt(maxNum); while(curr < end){ for(int i = 3; i < Math.sqrt(maxNum)+1; i = nextPrime(i)){ if((curr%i) == 0){ if(isPrime(curr)){ crossOut(curr); } } } curr = nextPrime(curr); } } } }
如果有人能告诉我并发程序的瓶颈在哪里,我会非常高兴。 提前致谢!
我不是JAVA编码器,所以我坚持使用C ++。 此外,这不是一个直接的答案你的问题(对不起,但我不能debugging的JAVA)作为一些指针哪些方式去检查或…
-
Eratosthenes的筛子
并行化是可能的,但速度不够快。 相反,我使用更多的筛选标签,每个标签都有自己的子分区,每个表的大小是所有子分隔符的普通乘积。 这样你只需要启动一次表,然后在
O(1)
检查它们, -
并行
在检查完所有筛子后,我将使用线程对所有未使用的除数进行明显的分割testing
-
memoize的
如果你有所有find的素数的活动表,然后除以素数,并添加所有发现的新素数
我正在使用对我来说足够快的非平行主要search…
- 你可以适应这个你的并行代码…
[编辑1]更新的代码
//--------------------------------------------------------------------------- int bits(DWORD p) { DWORD m=0x80000000; int b=32; for (;m;m>>=1,b--) if (p>=m) break; return b; } //--------------------------------------------------------------------------- DWORD sqrt(const DWORD &x) { DWORD m,a; m=(bits(x)>>1); if (m) m=1<<m; else m=1; for (a=0;m;m>>=1) { a|=m; if (a*a>x) a^=m; } return a; } //--------------------------------------------------------------------------- List<int> primes_i32; // list of precomputed primes const int primes_map_sz=4106301; // max size of map for speedup search for primes max(LCM(used primes per bit)) (not >>1 because SOE is periodic at double LCM size and only odd values are stored 2/2=1) const int primes_map_N[8]={ 4106301,3905765,3585337,4026077,3386981,3460469,3340219,3974653 }; const int primes_map_i0=33; // first index of prime not used in mask const int primes_map_p0=137; // biggest prime used in mask BYTE primes_map[primes_map_sz]; // factors map for first i0-1 primes bool primes_i32_alloc=false; int isprime(int p) { int i,j,a,b,an,im[8]; BYTE u; an=0; if (!primes_i32.num) // init primes vars { primes_i32.allocate(1024*1024); primes_i32.add( 2); for (i=1;i<primes_map_sz;i++) primes_map[i]=255; primes_map[0]=254; primes_i32.add( 3); for (u=255- 1,j= 3,i=j>>1;i<primes_map_sz;i+=j) primes_map[i]&=u; primes_i32.add( 5); for (u=255- 2,j= 5,i=j>>1;i<primes_map_sz;i+=j) primes_map[i]&=u; primes_i32.add( 7); for (u=255- 4,j= 7,i=j>>1;i<primes_map_sz;i+=j) primes_map[i]&=u; primes_i32.add( 11); for (u=255- 8,j= 11,i=j>>1;i<primes_map_sz;i+=j) primes_map[i]&=u; primes_i32.add( 13); for (u=255- 16,j= 13,i=j>>1;i<primes_map_sz;i+=j) primes_map[i]&=u; primes_i32.add( 17); for (u=255- 32,j= 17,i=j>>1;i<primes_map_sz;i+=j) primes_map[i]&=u; primes_i32.add( 19); for (u=255- 64,j= 19,i=j>>1;i<primes_map_sz;i+=j) primes_map[i]&=u; primes_i32.add( 23); for (u=255-128,j= 23,i=j>>1;i<primes_map_sz;i+=j) primes_map[i]&=u; primes_i32.add( 29); for (u=255- 1,j=137,i=j>>1;i<primes_map_sz;i+=j) primes_map[i]&=u; primes_i32.add( 31); for (u=255- 2,j=131,i=j>>1;i<primes_map_sz;i+=j) primes_map[i]&=u; primes_i32.add( 37); for (u=255- 4,j=127,i=j>>1;i<primes_map_sz;i+=j) primes_map[i]&=u; primes_i32.add( 41); for (u=255- 8,j=113,i=j>>1;i<primes_map_sz;i+=j) primes_map[i]&=u; primes_i32.add( 43); for (u=255- 16,j= 83,i=j>>1;i<primes_map_sz;i+=j) primes_map[i]&=u; primes_i32.add( 47); for (u=255- 32,j= 61,i=j>>1;i<primes_map_sz;i+=j) primes_map[i]&=u; primes_i32.add( 53); for (u=255- 64,j=107,i=j>>1;i<primes_map_sz;i+=j) primes_map[i]&=u; primes_i32.add( 59); for (u=255-128,j=101,i=j>>1;i<primes_map_sz;i+=j) primes_map[i]&=u; primes_i32.add( 61); for (u=255- 1,j=103,i=j>>1;i<primes_map_sz;i+=j) primes_map[i]&=u; primes_i32.add( 67); for (u=255- 2,j= 67,i=j>>1;i<primes_map_sz;i+=j) primes_map[i]&=u; primes_i32.add( 71); for (u=255- 4,j= 37,i=j>>1;i<primes_map_sz;i+=j) primes_map[i]&=u; primes_i32.add( 73); for (u=255- 8,j= 41,i=j>>1;i<primes_map_sz;i+=j) primes_map[i]&=u; primes_i32.add( 79); for (u=255- 16,j= 43,i=j>>1;i<primes_map_sz;i+=j) primes_map[i]&=u; primes_i32.add( 83); for (u=255- 32,j= 47,i=j>>1;i<primes_map_sz;i+=j) primes_map[i]&=u; primes_i32.add( 89); for (u=255- 64,j= 53,i=j>>1;i<primes_map_sz;i+=j) primes_map[i]&=u; primes_i32.add( 97); for (u=255-128,j= 59,i=j>>1;i<primes_map_sz;i+=j) primes_map[i]&=u; primes_i32.add(101); for (u=255- 1,j= 97,i=j>>1;i<primes_map_sz;i+=j) primes_map[i]&=u; primes_i32.add(103); for (u=255- 2,j= 89,i=j>>1;i<primes_map_sz;i+=j) primes_map[i]&=u; primes_i32.add(107); for (u=255- 4,j=109,i=j>>1;i<primes_map_sz;i+=j) primes_map[i]&=u; primes_i32.add(109); for (u=255- 8,j= 79,i=j>>1;i<primes_map_sz;i+=j) primes_map[i]&=u; primes_i32.add(113); for (u=255- 16,j= 73,i=j>>1;i<primes_map_sz;i+=j) primes_map[i]&=u; primes_i32.add(127); for (u=255- 32,j= 71,i=j>>1;i<primes_map_sz;i+=j) primes_map[i]&=u; primes_i32.add(131); for (u=255- 64,j= 31,i=j>>1;i<primes_map_sz;i+=j) primes_map[i]&=u; primes_i32.add(137); for (u=255-128,j= 29,i=j>>1;i<primes_map_sz;i+=j) primes_map[i]&=u; } if (!primes_i32_alloc) { if (p <=1) return 0; // ignore too small walues if (p&1==0) return 0; // not prime if even if (p>primes_map_p0) { j=p>>1; i=j; if (i>=primes_map_N[0]) i%=primes_map_N[0]; if (!(primes_map[i]& 1)) return 0; i=j; if (i>=primes_map_N[1]) i%=primes_map_N[1]; if (!(primes_map[i]& 2)) return 0; i=j; if (i>=primes_map_N[2]) i%=primes_map_N[2]; if (!(primes_map[i]& 4)) return 0; i=j; if (i>=primes_map_N[3]) i%=primes_map_N[3]; if (!(primes_map[i]& 8)) return 0; i=j; if (i>=primes_map_N[4]) i%=primes_map_N[4]; if (!(primes_map[i]& 16)) return 0; i=j; if (i>=primes_map_N[5]) i%=primes_map_N[5]; if (!(primes_map[i]& 32)) return 0; i=j; if (i>=primes_map_N[6]) i%=primes_map_N[6]; if (!(primes_map[i]& 64)) return 0; i=j; if (i>=primes_map_N[7]) i%=primes_map_N[7]; if (!(primes_map[i]&128)) return 0; } } an=primes_i32[primes_i32.num-1]; if (an>=p) { // linear table search if (p<127) // 31st prime { if (an>=p) for (i=0;i<primes_i32.num;i++) { a=primes_i32[i]; if (a==p) return 1; if (a> p) return 0; } } // approximation table search else{ for (j=1,i=primes_i32.num-1;j<i;j<<=1); j>>=1; if (!j) j=1; for (i=0;j;j>>=1) { i|=j; if (i>=primes_i32.num) { i-=j; continue; } a=primes_i32[i]; if (a==p) return 1; if (a> p) i-=j; } return 0; } } a=an; a+=2; for (j=a>>1,i=0;i<8;i++) im[i]=j%primes_map_N[i]; an=(1<<((bits(p)>>1)+1))-1; if (an<=0) an=1; an=an+an; for (;a<=p;a+=2) { for (j=1,i=0;i<8;i++,j<<=1) // check if map is set if (!(primes_map[im[i]]&j)) { j=-1; break; } // if not dont bother with division for (i=0;i<8;i++) { im[i]++; if (im[i]>=primes_map_N[i]) im[i]-=primes_map_N[i]; } if (j<0) continue; for (i=primes_map_i0;i<primes_i32.num;i++) { b=primes_i32[i]; if (b>an) break; if ((a%b)==0) { i=-1; break; } } if (i<0) continue; primes_i32.add(a); if (a==p) return 1; if (a> p) return 0; } return 0; } //--------------------------------------------------------------------------- void getprimes(int p) // compute and allocate primes up to p { if (!primes_i32.num) isprime(3); int p0=primes_i32[primes_i32.num-1]; // biggest prime computed yet if (p>p0+10000) // if too big difference use sieves to fast precompute { // T((0.3516+0.5756*log10(n))*n) -> O(n.log(n)) // sieves N/16 bytes p=100 000 000 t=1903.031 ms // ------------------------------ // 0 1 2 3 4 5 6 7 bit // ------------------------------ // 1 3 5 7 9 11 13 15 +-> +2 // 17 19 21 23 25 27 29 31 | // 33 35 37 39 41 43 45 47 V +16 // ------------------------------ int N=(p|15),M=(N>>4); // store only odd values 1,3,5,7,... each bit ... char *m=new char[M+1]; // m[i] -> is 1+i+i prime? (factors map) int i,j,k,n; // init sieves m[0]=254; for (i=1;i<=M;i++) m[i]=255; for(n=sqrt(p),i=1;i<=n;) { int a=m[i>>4]; if (int(a& 1)!=0) for(k=i+i,j=i+k;j<=N;j+=k) m[j>>4]&=255-(1<<((j>>1)&7)); i+=2; if (int(a& 2)!=0) for(k=i+i,j=i+k;j<=N;j+=k) m[j>>4]&=255-(1<<((j>>1)&7)); i+=2; if (int(a& 4)!=0) for(k=i+i,j=i+k;j<=N;j+=k) m[j>>4]&=255-(1<<((j>>1)&7)); i+=2; if (int(a& 8)!=0) for(k=i+i,j=i+k;j<=N;j+=k) m[j>>4]&=255-(1<<((j>>1)&7)); i+=2; if (int(a& 16)!=0) for(k=i+i,j=i+k;j<=N;j+=k) m[j>>4]&=255-(1<<((j>>1)&7)); i+=2; if (int(a& 32)!=0) for(k=i+i,j=i+k;j<=N;j+=k) m[j>>4]&=255-(1<<((j>>1)&7)); i+=2; if (int(a& 64)!=0) for(k=i+i,j=i+k;j<=N;j+=k) m[j>>4]&=255-(1<<((j>>1)&7)); i+=2; if (int(a&128)!=0) for(k=i+i,j=i+k;j<=N;j+=k) m[j>>4]&=255-(1<<((j>>1)&7)); i+=2; } // compute primes i=p0&0xFFFFFFF1; k=m[i>>4]; // start after last found prime in list if ((int(k& 1)!=0)&&(i>p0)) primes_i32.add(i); i+=2; if ((int(k& 2)!=0)&&(i>p0)) primes_i32.add(i); i+=2; if ((int(k& 4)!=0)&&(i>p0)) primes_i32.add(i); i+=2; if ((int(k& 8)!=0)&&(i>p0)) primes_i32.add(i); i+=2; if ((int(k& 16)!=0)&&(i>p0)) primes_i32.add(i); i+=2; if ((int(k& 32)!=0)&&(i>p0)) primes_i32.add(i); i+=2; if ((int(k& 64)!=0)&&(i>p0)) primes_i32.add(i); i+=2; if ((int(k&128)!=0)&&(i>p0)) primes_i32.add(i); i+=2; for(j=i>>4;j<M;i+=16,j++) // continue with 16-blocks { k=m[j]; if (!k) continue; if (int(k& 1)!=0) primes_i32.add(i ); if (int(k& 2)!=0) primes_i32.add(i+ 2); if (int(k& 4)!=0) primes_i32.add(i+ 4); if (int(k& 8)!=0) primes_i32.add(i+ 6); if (int(k& 16)!=0) primes_i32.add(i+ 8); if (int(k& 32)!=0) primes_i32.add(i+10); if (int(k& 64)!=0) primes_i32.add(i+12); if (int(k&128)!=0) primes_i32.add(i+14); } k=m[j]; // do the last primes if ((int(k& 1)!=0)&&(i<=p)) primes_i32.add(i); i+=2; if ((int(k& 2)!=0)&&(i<=p)) primes_i32.add(i); i+=2; if ((int(k& 4)!=0)&&(i<=p)) primes_i32.add(i); i+=2; if ((int(k& 8)!=0)&&(i<=p)) primes_i32.add(i); i+=2; if ((int(k& 16)!=0)&&(i<=p)) primes_i32.add(i); i+=2; if ((int(k& 32)!=0)&&(i<=p)) primes_i32.add(i); i+=2; if ((int(k& 64)!=0)&&(i<=p)) primes_i32.add(i); i+=2; if ((int(k&128)!=0)&&(i<=p)) primes_i32.add(i); i+=2; delete[] m; } else{ bool b0=primes_i32_alloc; primes_i32_alloc=true; isprime(p); primes_i32_alloc=false; primes_i32_alloc=b0; } } //---------------------------------------------------------------------------
- 解决了我的代码中的一些溢出错误(筛的周期…)
- 还有一些进一步的优化
- 添加了
getprimes(p)
函数,它将所有primes<=p
快速添加到列表中,因为如果它们还不存在 - testing了第一个1 000 000个素数的正确性(最高达15 485 863)
-
getprimes(15 485 863)
在175.563 ms解决了我的设置 -
isprime
对于这个粗糙的方法是比较慢的 -
primes_i32
是一个int
s的dynamic列表 -
primes_i32.num
是列表中int
的个数 -
primes_i32[i]
是第i
个int i = <0,primes_i32.num-1>
-
primes_i32.add(x)
将x
添加到列表的末尾 -
primes_i32.allocate(N)
为列表中的N
项目预分配空间以避免重新分配减速
[笔记]
我用欧拉问题10(2000000以下的所有素数之和)的这个非并行版本,
---------------------------------------------------------------------------------- Time ID Reference | My solution | Note ---------------------------------------------------------------------------------- [ 35.639 ms] Problem010. 142913828922 | 142913828922 | 64_bit
- 每个筛标签都是
primes_map[]
数组中的一个位片,只有奇数值被使用(甚至不需要记住筛子)。 - 如果你想最大化所有find的素数的速度,只需调用
isprime(max value)
并读取primes_i32[]
的内容 - 我使用一半的位大小而不是sqrt来提高速度
希望我没有忘记在这里复制一些东西